From 2bb7dbbb8e77757d97c2fd0ce55b818cf77e111a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 18 Jul 2024 14:39:41 +0000
Subject: [PATCH] Deployed 73959ef with MkDocs version: 1.6.0

---
 .nojekyll                                     |    0
 404.html                                      |  584 ++
 add-your-own-data/index.html                  |  857 +++
 api/base_dataset/index.html                   | 3901 ++++++++++
 api/config/index.html                         |  980 +++
 api/hf_dataset/index.html                     |  992 +++
 api/jsonl_dataset/index.html                  | 1082 +++
 assets/_mkdocstrings.css                      |  119 +
 assets/images/favicon.png                     |  Bin 0 -> 1870 bytes
 assets/javascripts/bundle.fe8b6f2b.min.js     |   29 +
 assets/javascripts/bundle.fe8b6f2b.min.js.map |    7 +
 assets/javascripts/lunr/min/lunr.ar.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.da.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.de.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.du.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.el.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.es.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.fi.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.fr.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.he.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.hi.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.hu.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.hy.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.it.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.ja.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.jp.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.kn.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.ko.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.multi.min.js |    1 +
 assets/javascripts/lunr/min/lunr.nl.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.no.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.pt.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.ro.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.ru.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.sa.min.js    |    1 +
 .../lunr/min/lunr.stemmer.support.min.js      |    1 +
 assets/javascripts/lunr/min/lunr.sv.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.ta.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.te.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.th.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.tr.min.js    |   18 +
 assets/javascripts/lunr/min/lunr.vi.min.js    |    1 +
 assets/javascripts/lunr/min/lunr.zh.min.js    |    1 +
 assets/javascripts/lunr/tinyseg.js            |  206 +
 assets/javascripts/lunr/wordcut.js            | 6708 +++++++++++++++++
 .../workers/search.b8dbb3d2.min.js            |   42 +
 .../workers/search.b8dbb3d2.min.js.map        |    7 +
 assets/stylesheets/main.76a95c52.min.css      |    1 +
 assets/stylesheets/main.76a95c52.min.css.map  |    1 +
 assets/stylesheets/palette.06af60db.min.css   |    1 +
 .../stylesheets/palette.06af60db.min.css.map  |    1 +
 compose-train-validation-data/index.html      |  632 ++
 config-files/index.html                       |  744 ++
 datasets/index.html                           | 1625 ++++
 datasets/language_af/index.html               | 1197 +++
 datasets/language_am/index.html               | 1197 +++
 datasets/language_an/index.html               | 1149 +++
 datasets/language_ar/index.html               | 1197 +++
 datasets/language_arz/index.html              | 1149 +++
 datasets/language_as/index.html               | 1149 +++
 datasets/language_ast/index.html              | 1149 +++
 datasets/language_av/index.html               | 1149 +++
 datasets/language_az/index.html               | 1149 +++
 datasets/language_azb/index.html              | 1149 +++
 datasets/language_ba/index.html               | 1149 +++
 datasets/language_be/index.html               | 1149 +++
 datasets/language_bg/index.html               | 1633 ++++
 datasets/language_bh/index.html               | 1149 +++
 datasets/language_bn/index.html               | 1149 +++
 datasets/language_bo/index.html               | 1149 +++
 datasets/language_bpy/index.html              | 1149 +++
 datasets/language_br/index.html               | 1149 +++
 datasets/language_bs/index.html               | 1149 +++
 datasets/language_bxr/index.html              | 1149 +++
 datasets/language_ca/index.html               | 1457 ++++
 datasets/language_ce/index.html               | 1149 +++
 datasets/language_ceb/index.html              | 1149 +++
 datasets/language_ckb/index.html              | 1149 +++
 datasets/language_code/index.html             | 4669 ++++++++++++
 datasets/language_cs/index.html               | 1545 ++++
 datasets/language_cv/index.html               | 1149 +++
 datasets/language_cy/index.html               | 1149 +++
 datasets/language_da/index.html               | 1545 ++++
 datasets/language_de/index.html               | 1589 ++++
 datasets/language_dsb/index.html              | 1149 +++
 datasets/language_dv/index.html               | 1149 +++
 datasets/language_el/index.html               | 1633 ++++
 datasets/language_en/index.html               | 2837 +++++++
 datasets/language_eo/index.html               | 1149 +++
 datasets/language_es/index.html               | 1545 ++++
 datasets/language_et/index.html               | 1545 ++++
 datasets/language_eu/index.html               | 1457 ++++
 datasets/language_fa/index.html               | 1149 +++
 datasets/language_fi/index.html               | 1545 ++++
 datasets/language_fr/index.html               | 1593 ++++
 datasets/language_fy/index.html               | 1149 +++
 datasets/language_ga/index.html               | 1457 ++++
 datasets/language_gd/index.html               | 1149 +++
 datasets/language_gl/index.html               | 1369 ++++
 datasets/language_gn/index.html               | 1149 +++
 datasets/language_gom/index.html              | 1149 +++
 datasets/language_gsw/index.html              | 1149 +++
 datasets/language_gu/index.html               | 1149 +++
 datasets/language_ha/index.html               |  669 ++
 datasets/language_he/index.html               | 1149 +++
 datasets/language_hi/index.html               | 1149 +++
 datasets/language_hr/index.html               | 1589 ++++
 datasets/language_hsb/index.html              | 1149 +++
 datasets/language_ht/index.html               | 1149 +++
 datasets/language_hu/index.html               | 1501 ++++
 datasets/language_hy/index.html               | 1149 +++
 datasets/language_ia/index.html               | 1149 +++
 datasets/language_id/index.html               | 1149 +++
 datasets/language_ie/index.html               | 1149 +++
 datasets/language_ig/index.html               |  669 ++
 datasets/language_ilo/index.html              | 1149 +++
 datasets/language_io/index.html               | 1149 +++
 datasets/language_is/index.html               | 1149 +++
 datasets/language_it/index.html               | 1545 ++++
 datasets/language_ja/index.html               | 1149 +++
 datasets/language_jbo/index.html              | 1149 +++
 datasets/language_jv/index.html               | 1149 +++
 datasets/language_ka/index.html               | 1149 +++
 datasets/language_kk/index.html               | 1149 +++
 datasets/language_km/index.html               | 1149 +++
 datasets/language_kn/index.html               | 1149 +++
 datasets/language_ko/index.html               | 1149 +++
 datasets/language_krc/index.html              | 1149 +++
 datasets/language_ku/index.html               | 1149 +++
 datasets/language_kv/index.html               | 1149 +++
 datasets/language_kw/index.html               | 1149 +++
 datasets/language_ky/index.html               | 1197 +++
 datasets/language_la/index.html               | 1149 +++
 datasets/language_lb/index.html               | 1149 +++
 datasets/language_lez/index.html              | 1149 +++
 datasets/language_li/index.html               | 1149 +++
 datasets/language_lmo/index.html              | 1149 +++
 datasets/language_lo/index.html               | 1149 +++
 datasets/language_lt/index.html               | 1457 ++++
 datasets/language_lv/index.html               | 1369 ++++
 datasets/language_mai/index.html              | 1149 +++
 datasets/language_mg/index.html               | 1149 +++
 datasets/language_mhr/index.html              | 1149 +++
 datasets/language_min/index.html              | 1149 +++
 datasets/language_mk/index.html               | 1149 +++
 datasets/language_ml/index.html               | 1149 +++
 datasets/language_mn/index.html               | 1149 +++
 datasets/language_mr/index.html               | 1149 +++
 datasets/language_mrj/index.html              | 1149 +++
 datasets/language_ms/index.html               | 1149 +++
 datasets/language_mt/index.html               | 1369 ++++
 datasets/language_multi/index.html            | 1149 +++
 datasets/language_mwl/index.html              | 1149 +++
 datasets/language_my/index.html               | 1149 +++
 datasets/language_mzn/index.html              | 1149 +++
 datasets/language_nah/index.html              | 1149 +++
 datasets/language_nds/index.html              | 1149 +++
 datasets/language_ne/index.html               | 1149 +++
 datasets/language_new/index.html              | 1149 +++
 datasets/language_nl/index.html               | 1765 +++++
 datasets/language_nn/index.html               | 1281 ++++
 datasets/language_no/index.html               | 1457 ++++
 datasets/language_ny/index.html               |  669 ++
 datasets/language_oc/index.html               | 1149 +++
 datasets/language_om/index.html               |  669 ++
 datasets/language_or/index.html               | 1149 +++
 datasets/language_os/index.html               | 1149 +++
 datasets/language_pa/index.html               | 1149 +++
 datasets/language_pl/index.html               | 1633 ++++
 datasets/language_pms/index.html              | 1149 +++
 datasets/language_pnb/index.html              | 1149 +++
 datasets/language_ps/index.html               | 1149 +++
 datasets/language_pt/index.html               | 1653 ++++
 datasets/language_qu/index.html               | 1149 +++
 datasets/language_ro/index.html               | 1589 ++++
 datasets/language_ru/index.html               | 1149 +++
 datasets/language_rw/index.html               |  669 ++
 datasets/language_sa/index.html               | 1149 +++
 datasets/language_sah/index.html              | 1149 +++
 datasets/language_sd/index.html               | 1149 +++
 datasets/language_sh/index.html               | 1193 +++
 datasets/language_si/index.html               | 1149 +++
 datasets/language_sk/index.html               | 1545 ++++
 datasets/language_sl/index.html               | 1633 ++++
 datasets/language_sn/index.html               |  669 ++
 datasets/language_so/index.html               | 1197 +++
 datasets/language_sq/index.html               | 1149 +++
 datasets/language_sr/index.html               | 1457 ++++
 datasets/language_st/index.html               |  669 ++
 datasets/language_su/index.html               | 1149 +++
 datasets/language_sv/index.html               | 1545 ++++
 datasets/language_sw/index.html               | 1197 +++
 datasets/language_ta/index.html               | 1149 +++
 datasets/language_te/index.html               | 1149 +++
 datasets/language_tg/index.html               | 1149 +++
 datasets/language_th/index.html               | 1149 +++
 datasets/language_ti/index.html               |  669 ++
 datasets/language_tk/index.html               | 1149 +++
 datasets/language_tl/index.html               | 1149 +++
 datasets/language_tr/index.html               | 1149 +++
 datasets/language_tt/index.html               | 1149 +++
 datasets/language_ug/index.html               | 1149 +++
 datasets/language_uk/index.html               | 1501 ++++
 datasets/language_ur/index.html               | 1149 +++
 datasets/language_uz/index.html               | 1149 +++
 datasets/language_vi/index.html               | 1149 +++
 datasets/language_vo/index.html               | 1149 +++
 datasets/language_wa/index.html               | 1149 +++
 datasets/language_war/index.html              | 1149 +++
 datasets/language_wuu/index.html              | 1149 +++
 datasets/language_x-eml/index.html            | 1149 +++
 datasets/language_xal/index.html              | 1149 +++
 datasets/language_xh/index.html               |  669 ++
 datasets/language_xmf/index.html              | 1149 +++
 datasets/language_yi/index.html               | 1149 +++
 datasets/language_yo/index.html               | 1197 +++
 datasets/language_zh/index.html               | 1149 +++
 datasets/language_zu/index.html               |  669 ++
 datasets/tokens_by_language.png               |  Bin 0 -> 21391 bytes
 datasets/tokens_by_source.png                 |  Bin 0 -> 41755 bytes
 extract-text-data/index.html                  |  683 ++
 getting-started/index.html                    |  805 ++
 ...a_pile_of_books__whit-removebg-preview.png |  Bin 0 -> 173165 bytes
 images/data-schema.svg                        |    1 +
 images/favicon-16x16.png                      |  Bin 0 -> 656 bytes
 images/favicon-32x32.png                      |  Bin 0 -> 1685 bytes
 images/favicon.ico                            |  Bin 0 -> 15406 bytes
 images/pipeline.svg                           |    1 +
 index.html                                    |  629 ++
 objects.inv                                   |    6 +
 overview/index.html                           |  703 ++
 related-work/index.html                       |  631 ++
 search/search_index.json                      |    1 +
 sitemap.xml                                   |  888 +++
 sitemap.xml.gz                                |  Bin 0 -> 894 bytes
 235 files changed, 223710 insertions(+)
 create mode 100644 .nojekyll
 create mode 100644 404.html
 create mode 100644 add-your-own-data/index.html
 create mode 100644 api/base_dataset/index.html
 create mode 100644 api/config/index.html
 create mode 100644 api/hf_dataset/index.html
 create mode 100644 api/jsonl_dataset/index.html
 create mode 100644 assets/_mkdocstrings.css
 create mode 100644 assets/images/favicon.png
 create mode 100644 assets/javascripts/bundle.fe8b6f2b.min.js
 create mode 100644 assets/javascripts/bundle.fe8b6f2b.min.js.map
 create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.da.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.multi.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.vi.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js
 create mode 100644 assets/javascripts/lunr/tinyseg.js
 create mode 100644 assets/javascripts/lunr/wordcut.js
 create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js
 create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js.map
 create mode 100644 assets/stylesheets/main.76a95c52.min.css
 create mode 100644 assets/stylesheets/main.76a95c52.min.css.map
 create mode 100644 assets/stylesheets/palette.06af60db.min.css
 create mode 100644 assets/stylesheets/palette.06af60db.min.css.map
 create mode 100644 compose-train-validation-data/index.html
 create mode 100644 config-files/index.html
 create mode 100644 datasets/index.html
 create mode 100644 datasets/language_af/index.html
 create mode 100644 datasets/language_am/index.html
 create mode 100644 datasets/language_an/index.html
 create mode 100644 datasets/language_ar/index.html
 create mode 100644 datasets/language_arz/index.html
 create mode 100644 datasets/language_as/index.html
 create mode 100644 datasets/language_ast/index.html
 create mode 100644 datasets/language_av/index.html
 create mode 100644 datasets/language_az/index.html
 create mode 100644 datasets/language_azb/index.html
 create mode 100644 datasets/language_ba/index.html
 create mode 100644 datasets/language_be/index.html
 create mode 100644 datasets/language_bg/index.html
 create mode 100644 datasets/language_bh/index.html
 create mode 100644 datasets/language_bn/index.html
 create mode 100644 datasets/language_bo/index.html
 create mode 100644 datasets/language_bpy/index.html
 create mode 100644 datasets/language_br/index.html
 create mode 100644 datasets/language_bs/index.html
 create mode 100644 datasets/language_bxr/index.html
 create mode 100644 datasets/language_ca/index.html
 create mode 100644 datasets/language_ce/index.html
 create mode 100644 datasets/language_ceb/index.html
 create mode 100644 datasets/language_ckb/index.html
 create mode 100644 datasets/language_code/index.html
 create mode 100644 datasets/language_cs/index.html
 create mode 100644 datasets/language_cv/index.html
 create mode 100644 datasets/language_cy/index.html
 create mode 100644 datasets/language_da/index.html
 create mode 100644 datasets/language_de/index.html
 create mode 100644 datasets/language_dsb/index.html
 create mode 100644 datasets/language_dv/index.html
 create mode 100644 datasets/language_el/index.html
 create mode 100644 datasets/language_en/index.html
 create mode 100644 datasets/language_eo/index.html
 create mode 100644 datasets/language_es/index.html
 create mode 100644 datasets/language_et/index.html
 create mode 100644 datasets/language_eu/index.html
 create mode 100644 datasets/language_fa/index.html
 create mode 100644 datasets/language_fi/index.html
 create mode 100644 datasets/language_fr/index.html
 create mode 100644 datasets/language_fy/index.html
 create mode 100644 datasets/language_ga/index.html
 create mode 100644 datasets/language_gd/index.html
 create mode 100644 datasets/language_gl/index.html
 create mode 100644 datasets/language_gn/index.html
 create mode 100644 datasets/language_gom/index.html
 create mode 100644 datasets/language_gsw/index.html
 create mode 100644 datasets/language_gu/index.html
 create mode 100644 datasets/language_ha/index.html
 create mode 100644 datasets/language_he/index.html
 create mode 100644 datasets/language_hi/index.html
 create mode 100644 datasets/language_hr/index.html
 create mode 100644 datasets/language_hsb/index.html
 create mode 100644 datasets/language_ht/index.html
 create mode 100644 datasets/language_hu/index.html
 create mode 100644 datasets/language_hy/index.html
 create mode 100644 datasets/language_ia/index.html
 create mode 100644 datasets/language_id/index.html
 create mode 100644 datasets/language_ie/index.html
 create mode 100644 datasets/language_ig/index.html
 create mode 100644 datasets/language_ilo/index.html
 create mode 100644 datasets/language_io/index.html
 create mode 100644 datasets/language_is/index.html
 create mode 100644 datasets/language_it/index.html
 create mode 100644 datasets/language_ja/index.html
 create mode 100644 datasets/language_jbo/index.html
 create mode 100644 datasets/language_jv/index.html
 create mode 100644 datasets/language_ka/index.html
 create mode 100644 datasets/language_kk/index.html
 create mode 100644 datasets/language_km/index.html
 create mode 100644 datasets/language_kn/index.html
 create mode 100644 datasets/language_ko/index.html
 create mode 100644 datasets/language_krc/index.html
 create mode 100644 datasets/language_ku/index.html
 create mode 100644 datasets/language_kv/index.html
 create mode 100644 datasets/language_kw/index.html
 create mode 100644 datasets/language_ky/index.html
 create mode 100644 datasets/language_la/index.html
 create mode 100644 datasets/language_lb/index.html
 create mode 100644 datasets/language_lez/index.html
 create mode 100644 datasets/language_li/index.html
 create mode 100644 datasets/language_lmo/index.html
 create mode 100644 datasets/language_lo/index.html
 create mode 100644 datasets/language_lt/index.html
 create mode 100644 datasets/language_lv/index.html
 create mode 100644 datasets/language_mai/index.html
 create mode 100644 datasets/language_mg/index.html
 create mode 100644 datasets/language_mhr/index.html
 create mode 100644 datasets/language_min/index.html
 create mode 100644 datasets/language_mk/index.html
 create mode 100644 datasets/language_ml/index.html
 create mode 100644 datasets/language_mn/index.html
 create mode 100644 datasets/language_mr/index.html
 create mode 100644 datasets/language_mrj/index.html
 create mode 100644 datasets/language_ms/index.html
 create mode 100644 datasets/language_mt/index.html
 create mode 100644 datasets/language_multi/index.html
 create mode 100644 datasets/language_mwl/index.html
 create mode 100644 datasets/language_my/index.html
 create mode 100644 datasets/language_mzn/index.html
 create mode 100644 datasets/language_nah/index.html
 create mode 100644 datasets/language_nds/index.html
 create mode 100644 datasets/language_ne/index.html
 create mode 100644 datasets/language_new/index.html
 create mode 100644 datasets/language_nl/index.html
 create mode 100644 datasets/language_nn/index.html
 create mode 100644 datasets/language_no/index.html
 create mode 100644 datasets/language_ny/index.html
 create mode 100644 datasets/language_oc/index.html
 create mode 100644 datasets/language_om/index.html
 create mode 100644 datasets/language_or/index.html
 create mode 100644 datasets/language_os/index.html
 create mode 100644 datasets/language_pa/index.html
 create mode 100644 datasets/language_pl/index.html
 create mode 100644 datasets/language_pms/index.html
 create mode 100644 datasets/language_pnb/index.html
 create mode 100644 datasets/language_ps/index.html
 create mode 100644 datasets/language_pt/index.html
 create mode 100644 datasets/language_qu/index.html
 create mode 100644 datasets/language_ro/index.html
 create mode 100644 datasets/language_ru/index.html
 create mode 100644 datasets/language_rw/index.html
 create mode 100644 datasets/language_sa/index.html
 create mode 100644 datasets/language_sah/index.html
 create mode 100644 datasets/language_sd/index.html
 create mode 100644 datasets/language_sh/index.html
 create mode 100644 datasets/language_si/index.html
 create mode 100644 datasets/language_sk/index.html
 create mode 100644 datasets/language_sl/index.html
 create mode 100644 datasets/language_sn/index.html
 create mode 100644 datasets/language_so/index.html
 create mode 100644 datasets/language_sq/index.html
 create mode 100644 datasets/language_sr/index.html
 create mode 100644 datasets/language_st/index.html
 create mode 100644 datasets/language_su/index.html
 create mode 100644 datasets/language_sv/index.html
 create mode 100644 datasets/language_sw/index.html
 create mode 100644 datasets/language_ta/index.html
 create mode 100644 datasets/language_te/index.html
 create mode 100644 datasets/language_tg/index.html
 create mode 100644 datasets/language_th/index.html
 create mode 100644 datasets/language_ti/index.html
 create mode 100644 datasets/language_tk/index.html
 create mode 100644 datasets/language_tl/index.html
 create mode 100644 datasets/language_tr/index.html
 create mode 100644 datasets/language_tt/index.html
 create mode 100644 datasets/language_ug/index.html
 create mode 100644 datasets/language_uk/index.html
 create mode 100644 datasets/language_ur/index.html
 create mode 100644 datasets/language_uz/index.html
 create mode 100644 datasets/language_vi/index.html
 create mode 100644 datasets/language_vo/index.html
 create mode 100644 datasets/language_wa/index.html
 create mode 100644 datasets/language_war/index.html
 create mode 100644 datasets/language_wuu/index.html
 create mode 100644 datasets/language_x-eml/index.html
 create mode 100644 datasets/language_xal/index.html
 create mode 100644 datasets/language_xh/index.html
 create mode 100644 datasets/language_xmf/index.html
 create mode 100644 datasets/language_yi/index.html
 create mode 100644 datasets/language_yo/index.html
 create mode 100644 datasets/language_zh/index.html
 create mode 100644 datasets/language_zu/index.html
 create mode 100644 datasets/tokens_by_language.png
 create mode 100644 datasets/tokens_by_source.png
 create mode 100644 extract-text-data/index.html
 create mode 100644 getting-started/index.html
 create mode 100644 images/A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png
 create mode 100644 images/data-schema.svg
 create mode 100644 images/favicon-16x16.png
 create mode 100644 images/favicon-32x32.png
 create mode 100644 images/favicon.ico
 create mode 100644 images/pipeline.svg
 create mode 100644 index.html
 create mode 100644 objects.inv
 create mode 100644 overview/index.html
 create mode 100644 related-work/index.html
 create mode 100644 search/search_index.json
 create mode 100644 sitemap.xml
 create mode 100644 sitemap.xml.gz

diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/404.html b/404.html
new file mode 100644
index 0000000..d52da84
--- /dev/null
+++ b/404.html
@@ -0,0 +1,584 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+      
+      
+      
+      <link rel="icon" href="/malteos/llm-datasets/assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="/malteos/llm-datasets/assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="/malteos/llm-datasets/assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("/malteos/llm-datasets/",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="/malteos/llm-datasets/." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="/malteos/llm-datasets/." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="/malteos/llm-datasets/api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+  <h1>404 - Not found</h1>
+
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "/malteos/llm-datasets/", "features": [], "search": "/malteos/llm-datasets/assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="/malteos/llm-datasets/assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/add-your-own-data/index.html b/add-your-own-data/index.html
new file mode 100644
index 0000000..e11530d
--- /dev/null
+++ b/add-your-own-data/index.html
@@ -0,0 +1,857 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/add-your-own-data/">
+      
+      
+        <link rel="prev" href="../extract-text-data/">
+      
+      
+        <link rel="next" href="../compose-train-validation-data/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Adding your own data - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#integrate-a-custom-dataset" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Adding your own data
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#write-a-dataset-class" class="md-nav__link">
+    <span class="md-ellipsis">
+      Write a dataset class
+    </span>
+  </a>
+  
+    <nav class="md-nav" aria-label="Write a dataset class">
+      <ul class="md-nav__list">
+        
+          <li class="md-nav__item">
+  <a href="#huggingface-dataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      Huggingface dataset
+    </span>
+  </a>
+  
+</li>
+        
+          <li class="md-nav__item">
+  <a href="#csv-dataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      CSV dataset
+    </span>
+  </a>
+  
+</li>
+        
+      </ul>
+    </nav>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#register-new-dataset-classes" class="md-nav__link">
+    <span class="md-ellipsis">
+      Register new dataset classes
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#load-registry-in-commands" class="md-nav__link">
+    <span class="md-ellipsis">
+      Load registry in commands
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#write-a-dataset-class" class="md-nav__link">
+    <span class="md-ellipsis">
+      Write a dataset class
+    </span>
+  </a>
+  
+    <nav class="md-nav" aria-label="Write a dataset class">
+      <ul class="md-nav__list">
+        
+          <li class="md-nav__item">
+  <a href="#huggingface-dataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      Huggingface dataset
+    </span>
+  </a>
+  
+</li>
+        
+          <li class="md-nav__item">
+  <a href="#csv-dataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      CSV dataset
+    </span>
+  </a>
+  
+</li>
+        
+      </ul>
+    </nav>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#register-new-dataset-classes" class="md-nav__link">
+    <span class="md-ellipsis">
+      Register new dataset classes
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#load-registry-in-commands" class="md-nav__link">
+    <span class="md-ellipsis">
+      Load registry in commands
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="integrate-a-custom-dataset">Integrate a custom dataset</h1>
+<h2 id="write-a-dataset-class">Write a dataset class</h2>
+<p>The first step for adding a new dataset is to write a new dataset class.
+If your data comes from a common source such as Huggingface, you can build upon existing abstractions.</p>
+<h3 id="huggingface-dataset">Huggingface dataset</h3>
+<p>For example, Huggingface datasets only need to specify some metadata such as dataset ID, title, etc., and the column from which the textual data is extracted (by default the <code>text</code> column):</p>
+<div class="language-python highlight"><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a><span class="c1"># my_datasets/pg19.py</span>
+</span><span id="__span-0-2"><a id="__codelineno-0-2" name="__codelineno-0-2" href="#__codelineno-0-2"></a>
+</span><span id="__span-0-3"><a id="__codelineno-0-3" name="__codelineno-0-3" href="#__codelineno-0-3"></a><span class="kn">from</span> <span class="nn">llm_datasets.datasets.hf_dataset</span> <span class="kn">import</span> <span class="n">HFDataset</span>
+</span><span id="__span-0-4"><a id="__codelineno-0-4" name="__codelineno-0-4" href="#__codelineno-0-4"></a><span class="kn">from</span> <span class="nn">llm_datasets.datasets.base</span> <span class="kn">import</span> <span class="n">License</span><span class="p">,</span> <span class="n">Availability</span>
+</span><span id="__span-0-5"><a id="__codelineno-0-5" name="__codelineno-0-5" href="#__codelineno-0-5"></a>
+</span><span id="__span-0-6"><a id="__codelineno-0-6" name="__codelineno-0-6" href="#__codelineno-0-6"></a><span class="k">class</span> <span class="nc">PG19Dataset</span><span class="p">(</span><span class="n">HFDataset</span><span class="p">):</span>
+</span><span id="__span-0-7"><a id="__codelineno-0-7" name="__codelineno-0-7" href="#__codelineno-0-7"></a>    <span class="n">DATASET_ID</span> <span class="o">=</span> <span class="s2">&quot;pg19&quot;</span>
+</span><span id="__span-0-8"><a id="__codelineno-0-8" name="__codelineno-0-8" href="#__codelineno-0-8"></a>    <span class="n">TITLE</span> <span class="o">=</span> <span class="s2">&quot;Project Gutenberg books published before 1919&quot;</span>
+</span><span id="__span-0-9"><a id="__codelineno-0-9" name="__codelineno-0-9" href="#__codelineno-0-9"></a>    <span class="n">HOMEPAGE</span> <span class="o">=</span> <span class="s2">&quot;https://huggingface.co/datasets/pg19&quot;</span>
+</span><span id="__span-0-10"><a id="__codelineno-0-10" name="__codelineno-0-10" href="#__codelineno-0-10"></a>    <span class="n">LICENSE</span> <span class="o">=</span> <span class="n">License</span><span class="p">(</span><span class="s2">&quot;Apache License Version 2.0 (or public domain?)&quot;</span><span class="p">,</span> <span class="n">url</span><span class="o">=</span><span class="s2">&quot;https://www.apache.org/licenses/LICENSE-2.0.html&quot;</span><span class="p">)</span>
+</span><span id="__span-0-11"><a id="__codelineno-0-11" name="__codelineno-0-11" href="#__codelineno-0-11"></a>    <span class="n">CITATION</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">&quot;&quot;&quot;@article{raecompressive2019,</span>
+</span><span id="__span-0-12"><a id="__codelineno-0-12" name="__codelineno-0-12" href="#__codelineno-0-12"></a><span class="s2">        author = {Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and</span>
+</span><span id="__span-0-13"><a id="__codelineno-0-13" name="__codelineno-0-13" href="#__codelineno-0-13"></a><span class="s2">                    Hillier, Chloe and Lillicrap, Timothy P},</span>
+</span><span id="__span-0-14"><a id="__codelineno-0-14" name="__codelineno-0-14" href="#__codelineno-0-14"></a><span class="s2">        title = {Compressive Transformers for Long-Range Sequence Modelling},</span>
+</span><span id="__span-0-15"><a id="__codelineno-0-15" name="__codelineno-0-15" href="#__codelineno-0-15"></a><span class="s2">        journal = {arXiv preprint},</span>
+</span><span id="__span-0-16"><a id="__codelineno-0-16" name="__codelineno-0-16" href="#__codelineno-0-16"></a><span class="s2">        url = {https://arxiv.org/abs/1911.05507},</span>
+</span><span id="__span-0-17"><a id="__codelineno-0-17" name="__codelineno-0-17" href="#__codelineno-0-17"></a><span class="s2">        year = </span><span class="si">{2019}</span><span class="s2">,</span>
+</span><span id="__span-0-18"><a id="__codelineno-0-18" name="__codelineno-0-18" href="#__codelineno-0-18"></a><span class="s2">        }</span>
+</span><span id="__span-0-19"><a id="__codelineno-0-19" name="__codelineno-0-19" href="#__codelineno-0-19"></a><span class="s2">        &quot;&quot;&quot;</span>  <span class="c1"># noqa</span>
+</span><span id="__span-0-20"><a id="__codelineno-0-20" name="__codelineno-0-20" href="#__codelineno-0-20"></a>    <span class="n">AVAILIBILITY</span> <span class="o">=</span> <span class="n">Availability</span><span class="o">.</span><span class="n">DIRECT_DOWNLOAD</span>
+</span><span id="__span-0-21"><a id="__codelineno-0-21" name="__codelineno-0-21" href="#__codelineno-0-21"></a>
+</span><span id="__span-0-22"><a id="__codelineno-0-22" name="__codelineno-0-22" href="#__codelineno-0-22"></a>    <span class="n">HF_DATASET_ID</span> <span class="o">=</span> <span class="s2">&quot;pg19&quot;</span>
+</span><span id="__span-0-23"><a id="__codelineno-0-23" name="__codelineno-0-23" href="#__codelineno-0-23"></a>    <span class="n">HF_DATASET_SPLIT</span> <span class="o">=</span> <span class="s2">&quot;train&quot;</span>
+</span><span id="__span-0-24"><a id="__codelineno-0-24" name="__codelineno-0-24" href="#__codelineno-0-24"></a>    <span class="n">streaming</span> <span class="o">=</span> <span class="kc">True</span>
+</span><span id="__span-0-25"><a id="__codelineno-0-25" name="__codelineno-0-25" href="#__codelineno-0-25"></a>    <span class="n">text_column_name</span> <span class="o">=</span> <span class="s2">&quot;text&quot;</span>
+</span><span id="__span-0-26"><a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a>    <span class="n">title_column_name</span> <span class="o">=</span> <span class="s2">&quot;short_book_title&quot;</span>
+</span></code></pre></div>
+<h3 id="csv-dataset">CSV dataset</h3>
+<p>Other datasets may require implementing the full text extraction logic. The example below reads text data from CSV files while excluding specific subsets:</p>
+<div class="language-python highlight"><pre><span></span><code><span id="__span-1-1"><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a><span class="c1"># my_datasets/csv_example.py</span>
+</span><span id="__span-1-2"><a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a>
+</span><span id="__span-1-3"><a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a><span class="kn">import</span> <span class="nn">logging</span>
+</span><span id="__span-1-4"><a id="__codelineno-1-4" name="__codelineno-1-4" href="#__codelineno-1-4"></a><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
+</span><span id="__span-1-5"><a id="__codelineno-1-5" name="__codelineno-1-5" href="#__codelineno-1-5"></a><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
+</span><span id="__span-1-6"><a id="__codelineno-1-6" name="__codelineno-1-6" href="#__codelineno-1-6"></a><span class="kn">from</span> <span class="nn">llm_datasets.datasets.base</span> <span class="kn">import</span> <span class="n">BaseDataset</span><span class="p">,</span> <span class="n">Availability</span><span class="p">,</span> <span class="n">License</span>
+</span><span id="__span-1-7"><a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a>
+</span><span id="__span-1-8"><a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a><span class="n">logger</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="vm">__name__</span><span class="p">)</span>
+</span><span id="__span-1-9"><a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a>
+</span><span id="__span-1-10"><a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a>
+</span><span id="__span-1-11"><a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a><span class="k">class</span> <span class="nc">CSVExampleDataset</span><span class="p">(</span><span class="n">BaseDataset</span><span class="p">):</span>
+</span><span id="__span-1-12"><a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a>    <span class="n">DATASET_ID</span> <span class="o">=</span> <span class="s2">&quot;csv_example&quot;</span>
+</span><span id="__span-1-13"><a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a>    <span class="n">TITLE</span> <span class="o">=</span> <span class="s2">&quot;An example for a dataset from CSV files&quot;</span>
+</span><span id="__span-1-14"><a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a>    <span class="n">AVAILIBITY</span> <span class="o">=</span> <span class="n">Availability</span><span class="o">.</span><span class="n">ON_REQUEST</span>
+</span><span id="__span-1-15"><a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a>    <span class="n">LANGUAGES</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;en&quot;</span><span class="p">]</span>
+</span><span id="__span-1-16"><a id="__codelineno-1-16" name="__codelineno-1-16" href="#__codelineno-1-16"></a>    <span class="n">LICENSE</span> <span class="o">=</span> <span class="n">License</span><span class="p">(</span><span class="s2">&quot;mixed&quot;</span><span class="p">)</span>
+</span><span id="__span-1-17"><a id="__codelineno-1-17" name="__codelineno-1-17" href="#__codelineno-1-17"></a>
+</span><span id="__span-1-18"><a id="__codelineno-1-18" name="__codelineno-1-18" href="#__codelineno-1-18"></a>    <span class="k">def</span> <span class="nf">get_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-1-19"><a id="__codelineno-1-19" name="__codelineno-1-19" href="#__codelineno-1-19"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
+</span><span id="__span-1-20"><a id="__codelineno-1-20" name="__codelineno-1-20" href="#__codelineno-1-20"></a><span class="sd">        Extract texts from CSV files (format: &quot;documen_id,text,score,url&quot;)</span>
+</span><span id="__span-1-21"><a id="__codelineno-1-21" name="__codelineno-1-21" href="#__codelineno-1-21"></a><span class="sd">        &quot;&quot;&quot;</span>
+</span><span id="__span-1-22"><a id="__codelineno-1-22" name="__codelineno-1-22" href="#__codelineno-1-22"></a>        <span class="c1"># Iterate over CSV files in raw dataset directory</span>
+</span><span id="__span-1-23"><a id="__codelineno-1-23" name="__codelineno-1-23" href="#__codelineno-1-23"></a>        <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_dataset_file_paths</span><span class="p">(</span><span class="n">needed_suffix</span><span class="o">=</span><span class="s2">&quot;.csv&quot;</span><span class="p">):</span>
+</span><span id="__span-1-24"><a id="__codelineno-1-24" name="__codelineno-1-24" href="#__codelineno-1-24"></a>            <span class="n">file_name</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span><span class="o">.</span><span class="n">name</span>
+</span><span id="__span-1-25"><a id="__codelineno-1-25" name="__codelineno-1-25" href="#__codelineno-1-25"></a>
+</span><span id="__span-1-26"><a id="__codelineno-1-26" name="__codelineno-1-26" href="#__codelineno-1-26"></a>            <span class="k">if</span> <span class="p">(</span>
+</span><span id="__span-1-27"><a id="__codelineno-1-27" name="__codelineno-1-27" href="#__codelineno-1-27"></a>                <span class="n">file_name</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;mc4_&quot;</span><span class="p">)</span>
+</span><span id="__span-1-28"><a id="__codelineno-1-28" name="__codelineno-1-28" href="#__codelineno-1-28"></a>                <span class="ow">or</span> <span class="n">file_name</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;colossal-oscar-&quot;</span><span class="p">)</span>
+</span><span id="__span-1-29"><a id="__codelineno-1-29" name="__codelineno-1-29" href="#__codelineno-1-29"></a>                <span class="ow">or</span> <span class="n">file_name</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;wikimedia&quot;</span><span class="p">)</span>
+</span><span id="__span-1-30"><a id="__codelineno-1-30" name="__codelineno-1-30" href="#__codelineno-1-30"></a>            <span class="p">):</span>
+</span><span id="__span-1-31"><a id="__codelineno-1-31" name="__codelineno-1-31" href="#__codelineno-1-31"></a>                <span class="c1"># skip subsets that overlap with other datasets (based on file name)</span>
+</span><span id="__span-1-32"><a id="__codelineno-1-32" name="__codelineno-1-32" href="#__codelineno-1-32"></a>                <span class="k">continue</span>
+</span><span id="__span-1-33"><a id="__codelineno-1-33" name="__codelineno-1-33" href="#__codelineno-1-33"></a>
+</span><span id="__span-1-34"><a id="__codelineno-1-34" name="__codelineno-1-34" href="#__codelineno-1-34"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Reading CSV: </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-1-35"><a id="__codelineno-1-35" name="__codelineno-1-35" href="#__codelineno-1-35"></a>            <span class="k">try</span><span class="p">:</span>
+</span><span id="__span-1-36"><a id="__codelineno-1-36" name="__codelineno-1-36" href="#__codelineno-1-36"></a>                <span class="c1"># Use chunks to reduce memory consumption</span>
+</span><span id="__span-1-37"><a id="__codelineno-1-37" name="__codelineno-1-37" href="#__codelineno-1-37"></a>                <span class="k">for</span> <span class="n">df</span> <span class="ow">in</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">file_path</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s2">&quot;,&quot;</span><span class="p">,</span> <span class="n">chunksize</span><span class="o">=</span><span class="mi">10_000</span><span class="p">):</span>
+</span><span id="__span-1-38"><a id="__codelineno-1-38" name="__codelineno-1-38" href="#__codelineno-1-38"></a>                    <span class="k">for</span> <span class="n">text</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">values</span><span class="p">:</span>
+</span><span id="__span-1-39"><a id="__codelineno-1-39" name="__codelineno-1-39" href="#__codelineno-1-39"></a>                        <span class="c1"># Pass extracted text</span>
+</span><span id="__span-1-40"><a id="__codelineno-1-40" name="__codelineno-1-40" href="#__codelineno-1-40"></a>                        <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-1-41"><a id="__codelineno-1-41" name="__codelineno-1-41" href="#__codelineno-1-41"></a>            <span class="k">except</span> <span class="ne">ValueError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
+</span><span id="__span-1-42"><a id="__codelineno-1-42" name="__codelineno-1-42" href="#__codelineno-1-42"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="s2">&quot;Error in file </span><span class="si">%s</span><span class="s2">; error = </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">file_path</span><span class="p">,</span> <span class="n">e</span><span class="p">)</span>
+</span></code></pre></div>
+<h2 id="register-new-dataset-classes">Register new dataset classes</h2>
+<p>Each dataset class needs to be registered with <code>llm-datasets</code> such that the commands know what classes are available.
+This can be done by creating a new Python module with a <code>get_registered_dataset_classes</code> function that returns a list of dataset classes:</p>
+<div class="language-python highlight"><pre><span></span><code><span id="__span-2-1"><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="c1"># my_datasets/dataset_registry.py</span>
+</span><span id="__span-2-2"><a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a><span class="kn">from</span> <span class="nn">my_datasets.pg19</span> <span class="kn">import</span> <span class="n">PG19Dataset</span>
+</span><span id="__span-2-3"><a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a>
+</span><span id="__span-2-4"><a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a><span class="k">def</span> <span class="nf">get_registered_dataset_classes</span><span class="p">():</span>
+</span><span id="__span-2-5"><a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a>    <span class="k">return</span> <span class="p">[</span>
+</span><span id="__span-2-6"><a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a>        <span class="n">PG19Dataset</span><span class="p">,</span>
+</span><span id="__span-2-7"><a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a>    <span class="p">]</span>
+</span></code></pre></div>
+<h2 id="load-registry-in-commands">Load registry in commands</h2>
+<p>To load the registered datasets in the pipeline commands, you need to specify the <code>--extra_dataset_registries</code> argument:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-3-1"><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a>llm-datasets<span class="w"> </span>compose<span class="w"> </span>...<span class="w"> </span>--extra_dataset_registries<span class="o">=</span>my_datasets.dataset_registry
+</span></code></pre></div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/api/base_dataset/index.html b/api/base_dataset/index.html
new file mode 100644
index 0000000..e0f9ff0
--- /dev/null
+++ b/api/base_dataset/index.html
@@ -0,0 +1,3901 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/api/base_dataset/">
+      
+      
+        <link rel="prev" href="../../related-work/">
+      
+      
+        <link rel="next" href="../hf_dataset/">
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>BaseDataset - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#basedataset" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              BaseDataset
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      BaseDataset
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.filter_documents" class="md-nav__link">
+    <span class="md-ellipsis">
+      filter_documents
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.filter_texts" class="md-nav__link">
+    <span class="md-ellipsis">
+      filter_texts
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.generate_texts_from_output" class="md-nav__link">
+    <span class="md-ellipsis">
+      generate_texts_from_output
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_compression_from_output_files" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_compression_from_output_files
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_estimated_bytes_from_output" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_estimated_bytes_from_output
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_output_rows_count" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_output_rows_count
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_sampling_factor" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_sampling_factor
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.is_selected" class="md-nav__link">
+    <span class="md-ellipsis">
+      is_selected
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_stats" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_stats
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_texts" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_texts
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_texts_to_jsonl" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_texts_to_jsonl
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_texts_to_parquet" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_texts_to_parquet
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      BaseDataset
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.filter_documents" class="md-nav__link">
+    <span class="md-ellipsis">
+      filter_documents
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.filter_texts" class="md-nav__link">
+    <span class="md-ellipsis">
+      filter_texts
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.generate_texts_from_output" class="md-nav__link">
+    <span class="md-ellipsis">
+      generate_texts_from_output
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_compression_from_output_files" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_compression_from_output_files
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_estimated_bytes_from_output" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_estimated_bytes_from_output
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_output_rows_count" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_output_rows_count
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.get_sampling_factor" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_sampling_factor
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.is_selected" class="md-nav__link">
+    <span class="md-ellipsis">
+      is_selected
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_stats" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_stats
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_texts" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_texts
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_texts_to_jsonl" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_texts_to_jsonl
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.base.BaseDataset.save_texts_to_parquet" class="md-nav__link">
+    <span class="md-ellipsis">
+      save_texts_to_parquet
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="basedataset">BaseDataset</h1>
+
+
+<div class="doc doc-object doc-class">
+
+
+
+<a id="llm_datasets.datasets.base.BaseDataset"></a>
+    <div class="doc doc-contents first">
+            <p class="doc doc-class-bases">
+              Bases: <code>object</code></p>
+
+
+      <p>Base class for all datasets. It implements all generic loading, processing, and writing methods.</p>
+
+              <details class="quote">
+                <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+                <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-128"> 128</a></span>
+<span class="normal"><a href="#__codelineno-0-129"> 129</a></span>
+<span class="normal"><a href="#__codelineno-0-130"> 130</a></span>
+<span class="normal"><a href="#__codelineno-0-131"> 131</a></span>
+<span class="normal"><a href="#__codelineno-0-132"> 132</a></span>
+<span class="normal"><a href="#__codelineno-0-133"> 133</a></span>
+<span class="normal"><a href="#__codelineno-0-134"> 134</a></span>
+<span class="normal"><a href="#__codelineno-0-135"> 135</a></span>
+<span class="normal"><a href="#__codelineno-0-136"> 136</a></span>
+<span class="normal"><a href="#__codelineno-0-137"> 137</a></span>
+<span class="normal"><a href="#__codelineno-0-138"> 138</a></span>
+<span class="normal"><a href="#__codelineno-0-139"> 139</a></span>
+<span class="normal"><a href="#__codelineno-0-140"> 140</a></span>
+<span class="normal"><a href="#__codelineno-0-141"> 141</a></span>
+<span class="normal"><a href="#__codelineno-0-142"> 142</a></span>
+<span class="normal"><a href="#__codelineno-0-143"> 143</a></span>
+<span class="normal"><a href="#__codelineno-0-144"> 144</a></span>
+<span class="normal"><a href="#__codelineno-0-145"> 145</a></span>
+<span class="normal"><a href="#__codelineno-0-146"> 146</a></span>
+<span class="normal"><a href="#__codelineno-0-147"> 147</a></span>
+<span class="normal"><a href="#__codelineno-0-148"> 148</a></span>
+<span class="normal"><a href="#__codelineno-0-149"> 149</a></span>
+<span class="normal"><a href="#__codelineno-0-150"> 150</a></span>
+<span class="normal"><a href="#__codelineno-0-151"> 151</a></span>
+<span class="normal"><a href="#__codelineno-0-152"> 152</a></span>
+<span class="normal"><a href="#__codelineno-0-153"> 153</a></span>
+<span class="normal"><a href="#__codelineno-0-154"> 154</a></span>
+<span class="normal"><a href="#__codelineno-0-155"> 155</a></span>
+<span class="normal"><a href="#__codelineno-0-156"> 156</a></span>
+<span class="normal"><a href="#__codelineno-0-157"> 157</a></span>
+<span class="normal"><a href="#__codelineno-0-158"> 158</a></span>
+<span class="normal"><a href="#__codelineno-0-159"> 159</a></span>
+<span class="normal"><a href="#__codelineno-0-160"> 160</a></span>
+<span class="normal"><a href="#__codelineno-0-161"> 161</a></span>
+<span class="normal"><a href="#__codelineno-0-162"> 162</a></span>
+<span class="normal"><a href="#__codelineno-0-163"> 163</a></span>
+<span class="normal"><a href="#__codelineno-0-164"> 164</a></span>
+<span class="normal"><a href="#__codelineno-0-165"> 165</a></span>
+<span class="normal"><a href="#__codelineno-0-166"> 166</a></span>
+<span class="normal"><a href="#__codelineno-0-167"> 167</a></span>
+<span class="normal"><a href="#__codelineno-0-168"> 168</a></span>
+<span class="normal"><a href="#__codelineno-0-169"> 169</a></span>
+<span class="normal"><a href="#__codelineno-0-170"> 170</a></span>
+<span class="normal"><a href="#__codelineno-0-171"> 171</a></span>
+<span class="normal"><a href="#__codelineno-0-172"> 172</a></span>
+<span class="normal"><a href="#__codelineno-0-173"> 173</a></span>
+<span class="normal"><a href="#__codelineno-0-174"> 174</a></span>
+<span class="normal"><a href="#__codelineno-0-175"> 175</a></span>
+<span class="normal"><a href="#__codelineno-0-176"> 176</a></span>
+<span class="normal"><a href="#__codelineno-0-177"> 177</a></span>
+<span class="normal"><a href="#__codelineno-0-178"> 178</a></span>
+<span class="normal"><a href="#__codelineno-0-179"> 179</a></span>
+<span class="normal"><a href="#__codelineno-0-180"> 180</a></span>
+<span class="normal"><a href="#__codelineno-0-181"> 181</a></span>
+<span class="normal"><a href="#__codelineno-0-182"> 182</a></span>
+<span class="normal"><a href="#__codelineno-0-183"> 183</a></span>
+<span class="normal"><a href="#__codelineno-0-184"> 184</a></span>
+<span class="normal"><a href="#__codelineno-0-185"> 185</a></span>
+<span class="normal"><a href="#__codelineno-0-186"> 186</a></span>
+<span class="normal"><a href="#__codelineno-0-187"> 187</a></span>
+<span class="normal"><a href="#__codelineno-0-188"> 188</a></span>
+<span class="normal"><a href="#__codelineno-0-189"> 189</a></span>
+<span class="normal"><a href="#__codelineno-0-190"> 190</a></span>
+<span class="normal"><a href="#__codelineno-0-191"> 191</a></span>
+<span class="normal"><a href="#__codelineno-0-192"> 192</a></span>
+<span class="normal"><a href="#__codelineno-0-193"> 193</a></span>
+<span class="normal"><a href="#__codelineno-0-194"> 194</a></span>
+<span class="normal"><a href="#__codelineno-0-195"> 195</a></span>
+<span class="normal"><a href="#__codelineno-0-196"> 196</a></span>
+<span class="normal"><a href="#__codelineno-0-197"> 197</a></span>
+<span class="normal"><a href="#__codelineno-0-198"> 198</a></span>
+<span class="normal"><a href="#__codelineno-0-199"> 199</a></span>
+<span class="normal"><a href="#__codelineno-0-200"> 200</a></span>
+<span class="normal"><a href="#__codelineno-0-201"> 201</a></span>
+<span class="normal"><a href="#__codelineno-0-202"> 202</a></span>
+<span class="normal"><a href="#__codelineno-0-203"> 203</a></span>
+<span class="normal"><a href="#__codelineno-0-204"> 204</a></span>
+<span class="normal"><a href="#__codelineno-0-205"> 205</a></span>
+<span class="normal"><a href="#__codelineno-0-206"> 206</a></span>
+<span class="normal"><a href="#__codelineno-0-207"> 207</a></span>
+<span class="normal"><a href="#__codelineno-0-208"> 208</a></span>
+<span class="normal"><a href="#__codelineno-0-209"> 209</a></span>
+<span class="normal"><a href="#__codelineno-0-210"> 210</a></span>
+<span class="normal"><a href="#__codelineno-0-211"> 211</a></span>
+<span class="normal"><a href="#__codelineno-0-212"> 212</a></span>
+<span class="normal"><a href="#__codelineno-0-213"> 213</a></span>
+<span class="normal"><a href="#__codelineno-0-214"> 214</a></span>
+<span class="normal"><a href="#__codelineno-0-215"> 215</a></span>
+<span class="normal"><a href="#__codelineno-0-216"> 216</a></span>
+<span class="normal"><a href="#__codelineno-0-217"> 217</a></span>
+<span class="normal"><a href="#__codelineno-0-218"> 218</a></span>
+<span class="normal"><a href="#__codelineno-0-219"> 219</a></span>
+<span class="normal"><a href="#__codelineno-0-220"> 220</a></span>
+<span class="normal"><a href="#__codelineno-0-221"> 221</a></span>
+<span class="normal"><a href="#__codelineno-0-222"> 222</a></span>
+<span class="normal"><a href="#__codelineno-0-223"> 223</a></span>
+<span class="normal"><a href="#__codelineno-0-224"> 224</a></span>
+<span class="normal"><a href="#__codelineno-0-225"> 225</a></span>
+<span class="normal"><a href="#__codelineno-0-226"> 226</a></span>
+<span class="normal"><a href="#__codelineno-0-227"> 227</a></span>
+<span class="normal"><a href="#__codelineno-0-228"> 228</a></span>
+<span class="normal"><a href="#__codelineno-0-229"> 229</a></span>
+<span class="normal"><a href="#__codelineno-0-230"> 230</a></span>
+<span class="normal"><a href="#__codelineno-0-231"> 231</a></span>
+<span class="normal"><a href="#__codelineno-0-232"> 232</a></span>
+<span class="normal"><a href="#__codelineno-0-233"> 233</a></span>
+<span class="normal"><a href="#__codelineno-0-234"> 234</a></span>
+<span class="normal"><a href="#__codelineno-0-235"> 235</a></span>
+<span class="normal"><a href="#__codelineno-0-236"> 236</a></span>
+<span class="normal"><a href="#__codelineno-0-237"> 237</a></span>
+<span class="normal"><a href="#__codelineno-0-238"> 238</a></span>
+<span class="normal"><a href="#__codelineno-0-239"> 239</a></span>
+<span class="normal"><a href="#__codelineno-0-240"> 240</a></span>
+<span class="normal"><a href="#__codelineno-0-241"> 241</a></span>
+<span class="normal"><a href="#__codelineno-0-242"> 242</a></span>
+<span class="normal"><a href="#__codelineno-0-243"> 243</a></span>
+<span class="normal"><a href="#__codelineno-0-244"> 244</a></span>
+<span class="normal"><a href="#__codelineno-0-245"> 245</a></span>
+<span class="normal"><a href="#__codelineno-0-246"> 246</a></span>
+<span class="normal"><a href="#__codelineno-0-247"> 247</a></span>
+<span class="normal"><a href="#__codelineno-0-248"> 248</a></span>
+<span class="normal"><a href="#__codelineno-0-249"> 249</a></span>
+<span class="normal"><a href="#__codelineno-0-250"> 250</a></span>
+<span class="normal"><a href="#__codelineno-0-251"> 251</a></span>
+<span class="normal"><a href="#__codelineno-0-252"> 252</a></span>
+<span class="normal"><a href="#__codelineno-0-253"> 253</a></span>
+<span class="normal"><a href="#__codelineno-0-254"> 254</a></span>
+<span class="normal"><a href="#__codelineno-0-255"> 255</a></span>
+<span class="normal"><a href="#__codelineno-0-256"> 256</a></span>
+<span class="normal"><a href="#__codelineno-0-257"> 257</a></span>
+<span class="normal"><a href="#__codelineno-0-258"> 258</a></span>
+<span class="normal"><a href="#__codelineno-0-259"> 259</a></span>
+<span class="normal"><a href="#__codelineno-0-260"> 260</a></span>
+<span class="normal"><a href="#__codelineno-0-261"> 261</a></span>
+<span class="normal"><a href="#__codelineno-0-262"> 262</a></span>
+<span class="normal"><a href="#__codelineno-0-263"> 263</a></span>
+<span class="normal"><a href="#__codelineno-0-264"> 264</a></span>
+<span class="normal"><a href="#__codelineno-0-265"> 265</a></span>
+<span class="normal"><a href="#__codelineno-0-266"> 266</a></span>
+<span class="normal"><a href="#__codelineno-0-267"> 267</a></span>
+<span class="normal"><a href="#__codelineno-0-268"> 268</a></span>
+<span class="normal"><a href="#__codelineno-0-269"> 269</a></span>
+<span class="normal"><a href="#__codelineno-0-270"> 270</a></span>
+<span class="normal"><a href="#__codelineno-0-271"> 271</a></span>
+<span class="normal"><a href="#__codelineno-0-272"> 272</a></span>
+<span class="normal"><a href="#__codelineno-0-273"> 273</a></span>
+<span class="normal"><a href="#__codelineno-0-274"> 274</a></span>
+<span class="normal"><a href="#__codelineno-0-275"> 275</a></span>
+<span class="normal"><a href="#__codelineno-0-276"> 276</a></span>
+<span class="normal"><a href="#__codelineno-0-277"> 277</a></span>
+<span class="normal"><a href="#__codelineno-0-278"> 278</a></span>
+<span class="normal"><a href="#__codelineno-0-279"> 279</a></span>
+<span class="normal"><a href="#__codelineno-0-280"> 280</a></span>
+<span class="normal"><a href="#__codelineno-0-281"> 281</a></span>
+<span class="normal"><a href="#__codelineno-0-282"> 282</a></span>
+<span class="normal"><a href="#__codelineno-0-283"> 283</a></span>
+<span class="normal"><a href="#__codelineno-0-284"> 284</a></span>
+<span class="normal"><a href="#__codelineno-0-285"> 285</a></span>
+<span class="normal"><a href="#__codelineno-0-286"> 286</a></span>
+<span class="normal"><a href="#__codelineno-0-287"> 287</a></span>
+<span class="normal"><a href="#__codelineno-0-288"> 288</a></span>
+<span class="normal"><a href="#__codelineno-0-289"> 289</a></span>
+<span class="normal"><a href="#__codelineno-0-290"> 290</a></span>
+<span class="normal"><a href="#__codelineno-0-291"> 291</a></span>
+<span class="normal"><a href="#__codelineno-0-292"> 292</a></span>
+<span class="normal"><a href="#__codelineno-0-293"> 293</a></span>
+<span class="normal"><a href="#__codelineno-0-294"> 294</a></span>
+<span class="normal"><a href="#__codelineno-0-295"> 295</a></span>
+<span class="normal"><a href="#__codelineno-0-296"> 296</a></span>
+<span class="normal"><a href="#__codelineno-0-297"> 297</a></span>
+<span class="normal"><a href="#__codelineno-0-298"> 298</a></span>
+<span class="normal"><a href="#__codelineno-0-299"> 299</a></span>
+<span class="normal"><a href="#__codelineno-0-300"> 300</a></span>
+<span class="normal"><a href="#__codelineno-0-301"> 301</a></span>
+<span class="normal"><a href="#__codelineno-0-302"> 302</a></span>
+<span class="normal"><a href="#__codelineno-0-303"> 303</a></span>
+<span class="normal"><a href="#__codelineno-0-304"> 304</a></span>
+<span class="normal"><a href="#__codelineno-0-305"> 305</a></span>
+<span class="normal"><a href="#__codelineno-0-306"> 306</a></span>
+<span class="normal"><a href="#__codelineno-0-307"> 307</a></span>
+<span class="normal"><a href="#__codelineno-0-308"> 308</a></span>
+<span class="normal"><a href="#__codelineno-0-309"> 309</a></span>
+<span class="normal"><a href="#__codelineno-0-310"> 310</a></span>
+<span class="normal"><a href="#__codelineno-0-311"> 311</a></span>
+<span class="normal"><a href="#__codelineno-0-312"> 312</a></span>
+<span class="normal"><a href="#__codelineno-0-313"> 313</a></span>
+<span class="normal"><a href="#__codelineno-0-314"> 314</a></span>
+<span class="normal"><a href="#__codelineno-0-315"> 315</a></span>
+<span class="normal"><a href="#__codelineno-0-316"> 316</a></span>
+<span class="normal"><a href="#__codelineno-0-317"> 317</a></span>
+<span class="normal"><a href="#__codelineno-0-318"> 318</a></span>
+<span class="normal"><a href="#__codelineno-0-319"> 319</a></span>
+<span class="normal"><a href="#__codelineno-0-320"> 320</a></span>
+<span class="normal"><a href="#__codelineno-0-321"> 321</a></span>
+<span class="normal"><a href="#__codelineno-0-322"> 322</a></span>
+<span class="normal"><a href="#__codelineno-0-323"> 323</a></span>
+<span class="normal"><a href="#__codelineno-0-324"> 324</a></span>
+<span class="normal"><a href="#__codelineno-0-325"> 325</a></span>
+<span class="normal"><a href="#__codelineno-0-326"> 326</a></span>
+<span class="normal"><a href="#__codelineno-0-327"> 327</a></span>
+<span class="normal"><a href="#__codelineno-0-328"> 328</a></span>
+<span class="normal"><a href="#__codelineno-0-329"> 329</a></span>
+<span class="normal"><a href="#__codelineno-0-330"> 330</a></span>
+<span class="normal"><a href="#__codelineno-0-331"> 331</a></span>
+<span class="normal"><a href="#__codelineno-0-332"> 332</a></span>
+<span class="normal"><a href="#__codelineno-0-333"> 333</a></span>
+<span class="normal"><a href="#__codelineno-0-334"> 334</a></span>
+<span class="normal"><a href="#__codelineno-0-335"> 335</a></span>
+<span class="normal"><a href="#__codelineno-0-336"> 336</a></span>
+<span class="normal"><a href="#__codelineno-0-337"> 337</a></span>
+<span class="normal"><a href="#__codelineno-0-338"> 338</a></span>
+<span class="normal"><a href="#__codelineno-0-339"> 339</a></span>
+<span class="normal"><a href="#__codelineno-0-340"> 340</a></span>
+<span class="normal"><a href="#__codelineno-0-341"> 341</a></span>
+<span class="normal"><a href="#__codelineno-0-342"> 342</a></span>
+<span class="normal"><a href="#__codelineno-0-343"> 343</a></span>
+<span class="normal"><a href="#__codelineno-0-344"> 344</a></span>
+<span class="normal"><a href="#__codelineno-0-345"> 345</a></span>
+<span class="normal"><a href="#__codelineno-0-346"> 346</a></span>
+<span class="normal"><a href="#__codelineno-0-347"> 347</a></span>
+<span class="normal"><a href="#__codelineno-0-348"> 348</a></span>
+<span class="normal"><a href="#__codelineno-0-349"> 349</a></span>
+<span class="normal"><a href="#__codelineno-0-350"> 350</a></span>
+<span class="normal"><a href="#__codelineno-0-351"> 351</a></span>
+<span class="normal"><a href="#__codelineno-0-352"> 352</a></span>
+<span class="normal"><a href="#__codelineno-0-353"> 353</a></span>
+<span class="normal"><a href="#__codelineno-0-354"> 354</a></span>
+<span class="normal"><a href="#__codelineno-0-355"> 355</a></span>
+<span class="normal"><a href="#__codelineno-0-356"> 356</a></span>
+<span class="normal"><a href="#__codelineno-0-357"> 357</a></span>
+<span class="normal"><a href="#__codelineno-0-358"> 358</a></span>
+<span class="normal"><a href="#__codelineno-0-359"> 359</a></span>
+<span class="normal"><a href="#__codelineno-0-360"> 360</a></span>
+<span class="normal"><a href="#__codelineno-0-361"> 361</a></span>
+<span class="normal"><a href="#__codelineno-0-362"> 362</a></span>
+<span class="normal"><a href="#__codelineno-0-363"> 363</a></span>
+<span class="normal"><a href="#__codelineno-0-364"> 364</a></span>
+<span class="normal"><a href="#__codelineno-0-365"> 365</a></span>
+<span class="normal"><a href="#__codelineno-0-366"> 366</a></span>
+<span class="normal"><a href="#__codelineno-0-367"> 367</a></span>
+<span class="normal"><a href="#__codelineno-0-368"> 368</a></span>
+<span class="normal"><a href="#__codelineno-0-369"> 369</a></span>
+<span class="normal"><a href="#__codelineno-0-370"> 370</a></span>
+<span class="normal"><a href="#__codelineno-0-371"> 371</a></span>
+<span class="normal"><a href="#__codelineno-0-372"> 372</a></span>
+<span class="normal"><a href="#__codelineno-0-373"> 373</a></span>
+<span class="normal"><a href="#__codelineno-0-374"> 374</a></span>
+<span class="normal"><a href="#__codelineno-0-375"> 375</a></span>
+<span class="normal"><a href="#__codelineno-0-376"> 376</a></span>
+<span class="normal"><a href="#__codelineno-0-377"> 377</a></span>
+<span class="normal"><a href="#__codelineno-0-378"> 378</a></span>
+<span class="normal"><a href="#__codelineno-0-379"> 379</a></span>
+<span class="normal"><a href="#__codelineno-0-380"> 380</a></span>
+<span class="normal"><a href="#__codelineno-0-381"> 381</a></span>
+<span class="normal"><a href="#__codelineno-0-382"> 382</a></span>
+<span class="normal"><a href="#__codelineno-0-383"> 383</a></span>
+<span class="normal"><a href="#__codelineno-0-384"> 384</a></span>
+<span class="normal"><a href="#__codelineno-0-385"> 385</a></span>
+<span class="normal"><a href="#__codelineno-0-386"> 386</a></span>
+<span class="normal"><a href="#__codelineno-0-387"> 387</a></span>
+<span class="normal"><a href="#__codelineno-0-388"> 388</a></span>
+<span class="normal"><a href="#__codelineno-0-389"> 389</a></span>
+<span class="normal"><a href="#__codelineno-0-390"> 390</a></span>
+<span class="normal"><a href="#__codelineno-0-391"> 391</a></span>
+<span class="normal"><a href="#__codelineno-0-392"> 392</a></span>
+<span class="normal"><a href="#__codelineno-0-393"> 393</a></span>
+<span class="normal"><a href="#__codelineno-0-394"> 394</a></span>
+<span class="normal"><a href="#__codelineno-0-395"> 395</a></span>
+<span class="normal"><a href="#__codelineno-0-396"> 396</a></span>
+<span class="normal"><a href="#__codelineno-0-397"> 397</a></span>
+<span class="normal"><a href="#__codelineno-0-398"> 398</a></span>
+<span class="normal"><a href="#__codelineno-0-399"> 399</a></span>
+<span class="normal"><a href="#__codelineno-0-400"> 400</a></span>
+<span class="normal"><a href="#__codelineno-0-401"> 401</a></span>
+<span class="normal"><a href="#__codelineno-0-402"> 402</a></span>
+<span class="normal"><a href="#__codelineno-0-403"> 403</a></span>
+<span class="normal"><a href="#__codelineno-0-404"> 404</a></span>
+<span class="normal"><a href="#__codelineno-0-405"> 405</a></span>
+<span class="normal"><a href="#__codelineno-0-406"> 406</a></span>
+<span class="normal"><a href="#__codelineno-0-407"> 407</a></span>
+<span class="normal"><a href="#__codelineno-0-408"> 408</a></span>
+<span class="normal"><a href="#__codelineno-0-409"> 409</a></span>
+<span class="normal"><a href="#__codelineno-0-410"> 410</a></span>
+<span class="normal"><a href="#__codelineno-0-411"> 411</a></span>
+<span class="normal"><a href="#__codelineno-0-412"> 412</a></span>
+<span class="normal"><a href="#__codelineno-0-413"> 413</a></span>
+<span class="normal"><a href="#__codelineno-0-414"> 414</a></span>
+<span class="normal"><a href="#__codelineno-0-415"> 415</a></span>
+<span class="normal"><a href="#__codelineno-0-416"> 416</a></span>
+<span class="normal"><a href="#__codelineno-0-417"> 417</a></span>
+<span class="normal"><a href="#__codelineno-0-418"> 418</a></span>
+<span class="normal"><a href="#__codelineno-0-419"> 419</a></span>
+<span class="normal"><a href="#__codelineno-0-420"> 420</a></span>
+<span class="normal"><a href="#__codelineno-0-421"> 421</a></span>
+<span class="normal"><a href="#__codelineno-0-422"> 422</a></span>
+<span class="normal"><a href="#__codelineno-0-423"> 423</a></span>
+<span class="normal"><a href="#__codelineno-0-424"> 424</a></span>
+<span class="normal"><a href="#__codelineno-0-425"> 425</a></span>
+<span class="normal"><a href="#__codelineno-0-426"> 426</a></span>
+<span class="normal"><a href="#__codelineno-0-427"> 427</a></span>
+<span class="normal"><a href="#__codelineno-0-428"> 428</a></span>
+<span class="normal"><a href="#__codelineno-0-429"> 429</a></span>
+<span class="normal"><a href="#__codelineno-0-430"> 430</a></span>
+<span class="normal"><a href="#__codelineno-0-431"> 431</a></span>
+<span class="normal"><a href="#__codelineno-0-432"> 432</a></span>
+<span class="normal"><a href="#__codelineno-0-433"> 433</a></span>
+<span class="normal"><a href="#__codelineno-0-434"> 434</a></span>
+<span class="normal"><a href="#__codelineno-0-435"> 435</a></span>
+<span class="normal"><a href="#__codelineno-0-436"> 436</a></span>
+<span class="normal"><a href="#__codelineno-0-437"> 437</a></span>
+<span class="normal"><a href="#__codelineno-0-438"> 438</a></span>
+<span class="normal"><a href="#__codelineno-0-439"> 439</a></span>
+<span class="normal"><a href="#__codelineno-0-440"> 440</a></span>
+<span class="normal"><a href="#__codelineno-0-441"> 441</a></span>
+<span class="normal"><a href="#__codelineno-0-442"> 442</a></span>
+<span class="normal"><a href="#__codelineno-0-443"> 443</a></span>
+<span class="normal"><a href="#__codelineno-0-444"> 444</a></span>
+<span class="normal"><a href="#__codelineno-0-445"> 445</a></span>
+<span class="normal"><a href="#__codelineno-0-446"> 446</a></span>
+<span class="normal"><a href="#__codelineno-0-447"> 447</a></span>
+<span class="normal"><a href="#__codelineno-0-448"> 448</a></span>
+<span class="normal"><a href="#__codelineno-0-449"> 449</a></span>
+<span class="normal"><a href="#__codelineno-0-450"> 450</a></span>
+<span class="normal"><a href="#__codelineno-0-451"> 451</a></span>
+<span class="normal"><a href="#__codelineno-0-452"> 452</a></span>
+<span class="normal"><a href="#__codelineno-0-453"> 453</a></span>
+<span class="normal"><a href="#__codelineno-0-454"> 454</a></span>
+<span class="normal"><a href="#__codelineno-0-455"> 455</a></span>
+<span class="normal"><a href="#__codelineno-0-456"> 456</a></span>
+<span class="normal"><a href="#__codelineno-0-457"> 457</a></span>
+<span class="normal"><a href="#__codelineno-0-458"> 458</a></span>
+<span class="normal"><a href="#__codelineno-0-459"> 459</a></span>
+<span class="normal"><a href="#__codelineno-0-460"> 460</a></span>
+<span class="normal"><a href="#__codelineno-0-461"> 461</a></span>
+<span class="normal"><a href="#__codelineno-0-462"> 462</a></span>
+<span class="normal"><a href="#__codelineno-0-463"> 463</a></span>
+<span class="normal"><a href="#__codelineno-0-464"> 464</a></span>
+<span class="normal"><a href="#__codelineno-0-465"> 465</a></span>
+<span class="normal"><a href="#__codelineno-0-466"> 466</a></span>
+<span class="normal"><a href="#__codelineno-0-467"> 467</a></span>
+<span class="normal"><a href="#__codelineno-0-468"> 468</a></span>
+<span class="normal"><a href="#__codelineno-0-469"> 469</a></span>
+<span class="normal"><a href="#__codelineno-0-470"> 470</a></span>
+<span class="normal"><a href="#__codelineno-0-471"> 471</a></span>
+<span class="normal"><a href="#__codelineno-0-472"> 472</a></span>
+<span class="normal"><a href="#__codelineno-0-473"> 473</a></span>
+<span class="normal"><a href="#__codelineno-0-474"> 474</a></span>
+<span class="normal"><a href="#__codelineno-0-475"> 475</a></span>
+<span class="normal"><a href="#__codelineno-0-476"> 476</a></span>
+<span class="normal"><a href="#__codelineno-0-477"> 477</a></span>
+<span class="normal"><a href="#__codelineno-0-478"> 478</a></span>
+<span class="normal"><a href="#__codelineno-0-479"> 479</a></span>
+<span class="normal"><a href="#__codelineno-0-480"> 480</a></span>
+<span class="normal"><a href="#__codelineno-0-481"> 481</a></span>
+<span class="normal"><a href="#__codelineno-0-482"> 482</a></span>
+<span class="normal"><a href="#__codelineno-0-483"> 483</a></span>
+<span class="normal"><a href="#__codelineno-0-484"> 484</a></span>
+<span class="normal"><a href="#__codelineno-0-485"> 485</a></span>
+<span class="normal"><a href="#__codelineno-0-486"> 486</a></span>
+<span class="normal"><a href="#__codelineno-0-487"> 487</a></span>
+<span class="normal"><a href="#__codelineno-0-488"> 488</a></span>
+<span class="normal"><a href="#__codelineno-0-489"> 489</a></span>
+<span class="normal"><a href="#__codelineno-0-490"> 490</a></span>
+<span class="normal"><a href="#__codelineno-0-491"> 491</a></span>
+<span class="normal"><a href="#__codelineno-0-492"> 492</a></span>
+<span class="normal"><a href="#__codelineno-0-493"> 493</a></span>
+<span class="normal"><a href="#__codelineno-0-494"> 494</a></span>
+<span class="normal"><a href="#__codelineno-0-495"> 495</a></span>
+<span class="normal"><a href="#__codelineno-0-496"> 496</a></span>
+<span class="normal"><a href="#__codelineno-0-497"> 497</a></span>
+<span class="normal"><a href="#__codelineno-0-498"> 498</a></span>
+<span class="normal"><a href="#__codelineno-0-499"> 499</a></span>
+<span class="normal"><a href="#__codelineno-0-500"> 500</a></span>
+<span class="normal"><a href="#__codelineno-0-501"> 501</a></span>
+<span class="normal"><a href="#__codelineno-0-502"> 502</a></span>
+<span class="normal"><a href="#__codelineno-0-503"> 503</a></span>
+<span class="normal"><a href="#__codelineno-0-504"> 504</a></span>
+<span class="normal"><a href="#__codelineno-0-505"> 505</a></span>
+<span class="normal"><a href="#__codelineno-0-506"> 506</a></span>
+<span class="normal"><a href="#__codelineno-0-507"> 507</a></span>
+<span class="normal"><a href="#__codelineno-0-508"> 508</a></span>
+<span class="normal"><a href="#__codelineno-0-509"> 509</a></span>
+<span class="normal"><a href="#__codelineno-0-510"> 510</a></span>
+<span class="normal"><a href="#__codelineno-0-511"> 511</a></span>
+<span class="normal"><a href="#__codelineno-0-512"> 512</a></span>
+<span class="normal"><a href="#__codelineno-0-513"> 513</a></span>
+<span class="normal"><a href="#__codelineno-0-514"> 514</a></span>
+<span class="normal"><a href="#__codelineno-0-515"> 515</a></span>
+<span class="normal"><a href="#__codelineno-0-516"> 516</a></span>
+<span class="normal"><a href="#__codelineno-0-517"> 517</a></span>
+<span class="normal"><a href="#__codelineno-0-518"> 518</a></span>
+<span class="normal"><a href="#__codelineno-0-519"> 519</a></span>
+<span class="normal"><a href="#__codelineno-0-520"> 520</a></span>
+<span class="normal"><a href="#__codelineno-0-521"> 521</a></span>
+<span class="normal"><a href="#__codelineno-0-522"> 522</a></span>
+<span class="normal"><a href="#__codelineno-0-523"> 523</a></span>
+<span class="normal"><a href="#__codelineno-0-524"> 524</a></span>
+<span class="normal"><a href="#__codelineno-0-525"> 525</a></span>
+<span class="normal"><a href="#__codelineno-0-526"> 526</a></span>
+<span class="normal"><a href="#__codelineno-0-527"> 527</a></span>
+<span class="normal"><a href="#__codelineno-0-528"> 528</a></span>
+<span class="normal"><a href="#__codelineno-0-529"> 529</a></span>
+<span class="normal"><a href="#__codelineno-0-530"> 530</a></span>
+<span class="normal"><a href="#__codelineno-0-531"> 531</a></span>
+<span class="normal"><a href="#__codelineno-0-532"> 532</a></span>
+<span class="normal"><a href="#__codelineno-0-533"> 533</a></span>
+<span class="normal"><a href="#__codelineno-0-534"> 534</a></span>
+<span class="normal"><a href="#__codelineno-0-535"> 535</a></span>
+<span class="normal"><a href="#__codelineno-0-536"> 536</a></span>
+<span class="normal"><a href="#__codelineno-0-537"> 537</a></span>
+<span class="normal"><a href="#__codelineno-0-538"> 538</a></span>
+<span class="normal"><a href="#__codelineno-0-539"> 539</a></span>
+<span class="normal"><a href="#__codelineno-0-540"> 540</a></span>
+<span class="normal"><a href="#__codelineno-0-541"> 541</a></span>
+<span class="normal"><a href="#__codelineno-0-542"> 542</a></span>
+<span class="normal"><a href="#__codelineno-0-543"> 543</a></span>
+<span class="normal"><a href="#__codelineno-0-544"> 544</a></span>
+<span class="normal"><a href="#__codelineno-0-545"> 545</a></span>
+<span class="normal"><a href="#__codelineno-0-546"> 546</a></span>
+<span class="normal"><a href="#__codelineno-0-547"> 547</a></span>
+<span class="normal"><a href="#__codelineno-0-548"> 548</a></span>
+<span class="normal"><a href="#__codelineno-0-549"> 549</a></span>
+<span class="normal"><a href="#__codelineno-0-550"> 550</a></span>
+<span class="normal"><a href="#__codelineno-0-551"> 551</a></span>
+<span class="normal"><a href="#__codelineno-0-552"> 552</a></span>
+<span class="normal"><a href="#__codelineno-0-553"> 553</a></span>
+<span class="normal"><a href="#__codelineno-0-554"> 554</a></span>
+<span class="normal"><a href="#__codelineno-0-555"> 555</a></span>
+<span class="normal"><a href="#__codelineno-0-556"> 556</a></span>
+<span class="normal"><a href="#__codelineno-0-557"> 557</a></span>
+<span class="normal"><a href="#__codelineno-0-558"> 558</a></span>
+<span class="normal"><a href="#__codelineno-0-559"> 559</a></span>
+<span class="normal"><a href="#__codelineno-0-560"> 560</a></span>
+<span class="normal"><a href="#__codelineno-0-561"> 561</a></span>
+<span class="normal"><a href="#__codelineno-0-562"> 562</a></span>
+<span class="normal"><a href="#__codelineno-0-563"> 563</a></span>
+<span class="normal"><a href="#__codelineno-0-564"> 564</a></span>
+<span class="normal"><a href="#__codelineno-0-565"> 565</a></span>
+<span class="normal"><a href="#__codelineno-0-566"> 566</a></span>
+<span class="normal"><a href="#__codelineno-0-567"> 567</a></span>
+<span class="normal"><a href="#__codelineno-0-568"> 568</a></span>
+<span class="normal"><a href="#__codelineno-0-569"> 569</a></span>
+<span class="normal"><a href="#__codelineno-0-570"> 570</a></span>
+<span class="normal"><a href="#__codelineno-0-571"> 571</a></span>
+<span class="normal"><a href="#__codelineno-0-572"> 572</a></span>
+<span class="normal"><a href="#__codelineno-0-573"> 573</a></span>
+<span class="normal"><a href="#__codelineno-0-574"> 574</a></span>
+<span class="normal"><a href="#__codelineno-0-575"> 575</a></span>
+<span class="normal"><a href="#__codelineno-0-576"> 576</a></span>
+<span class="normal"><a href="#__codelineno-0-577"> 577</a></span>
+<span class="normal"><a href="#__codelineno-0-578"> 578</a></span>
+<span class="normal"><a href="#__codelineno-0-579"> 579</a></span>
+<span class="normal"><a href="#__codelineno-0-580"> 580</a></span>
+<span class="normal"><a href="#__codelineno-0-581"> 581</a></span>
+<span class="normal"><a href="#__codelineno-0-582"> 582</a></span>
+<span class="normal"><a href="#__codelineno-0-583"> 583</a></span>
+<span class="normal"><a href="#__codelineno-0-584"> 584</a></span>
+<span class="normal"><a href="#__codelineno-0-585"> 585</a></span>
+<span class="normal"><a href="#__codelineno-0-586"> 586</a></span>
+<span class="normal"><a href="#__codelineno-0-587"> 587</a></span>
+<span class="normal"><a href="#__codelineno-0-588"> 588</a></span>
+<span class="normal"><a href="#__codelineno-0-589"> 589</a></span>
+<span class="normal"><a href="#__codelineno-0-590"> 590</a></span>
+<span class="normal"><a href="#__codelineno-0-591"> 591</a></span>
+<span class="normal"><a href="#__codelineno-0-592"> 592</a></span>
+<span class="normal"><a href="#__codelineno-0-593"> 593</a></span>
+<span class="normal"><a href="#__codelineno-0-594"> 594</a></span>
+<span class="normal"><a href="#__codelineno-0-595"> 595</a></span>
+<span class="normal"><a href="#__codelineno-0-596"> 596</a></span>
+<span class="normal"><a href="#__codelineno-0-597"> 597</a></span>
+<span class="normal"><a href="#__codelineno-0-598"> 598</a></span>
+<span class="normal"><a href="#__codelineno-0-599"> 599</a></span>
+<span class="normal"><a href="#__codelineno-0-600"> 600</a></span>
+<span class="normal"><a href="#__codelineno-0-601"> 601</a></span>
+<span class="normal"><a href="#__codelineno-0-602"> 602</a></span>
+<span class="normal"><a href="#__codelineno-0-603"> 603</a></span>
+<span class="normal"><a href="#__codelineno-0-604"> 604</a></span>
+<span class="normal"><a href="#__codelineno-0-605"> 605</a></span>
+<span class="normal"><a href="#__codelineno-0-606"> 606</a></span>
+<span class="normal"><a href="#__codelineno-0-607"> 607</a></span>
+<span class="normal"><a href="#__codelineno-0-608"> 608</a></span>
+<span class="normal"><a href="#__codelineno-0-609"> 609</a></span>
+<span class="normal"><a href="#__codelineno-0-610"> 610</a></span>
+<span class="normal"><a href="#__codelineno-0-611"> 611</a></span>
+<span class="normal"><a href="#__codelineno-0-612"> 612</a></span>
+<span class="normal"><a href="#__codelineno-0-613"> 613</a></span>
+<span class="normal"><a href="#__codelineno-0-614"> 614</a></span>
+<span class="normal"><a href="#__codelineno-0-615"> 615</a></span>
+<span class="normal"><a href="#__codelineno-0-616"> 616</a></span>
+<span class="normal"><a href="#__codelineno-0-617"> 617</a></span>
+<span class="normal"><a href="#__codelineno-0-618"> 618</a></span>
+<span class="normal"><a href="#__codelineno-0-619"> 619</a></span>
+<span class="normal"><a href="#__codelineno-0-620"> 620</a></span>
+<span class="normal"><a href="#__codelineno-0-621"> 621</a></span>
+<span class="normal"><a href="#__codelineno-0-622"> 622</a></span>
+<span class="normal"><a href="#__codelineno-0-623"> 623</a></span>
+<span class="normal"><a href="#__codelineno-0-624"> 624</a></span>
+<span class="normal"><a href="#__codelineno-0-625"> 625</a></span>
+<span class="normal"><a href="#__codelineno-0-626"> 626</a></span>
+<span class="normal"><a href="#__codelineno-0-627"> 627</a></span>
+<span class="normal"><a href="#__codelineno-0-628"> 628</a></span>
+<span class="normal"><a href="#__codelineno-0-629"> 629</a></span>
+<span class="normal"><a href="#__codelineno-0-630"> 630</a></span>
+<span class="normal"><a href="#__codelineno-0-631"> 631</a></span>
+<span class="normal"><a href="#__codelineno-0-632"> 632</a></span>
+<span class="normal"><a href="#__codelineno-0-633"> 633</a></span>
+<span class="normal"><a href="#__codelineno-0-634"> 634</a></span>
+<span class="normal"><a href="#__codelineno-0-635"> 635</a></span>
+<span class="normal"><a href="#__codelineno-0-636"> 636</a></span>
+<span class="normal"><a href="#__codelineno-0-637"> 637</a></span>
+<span class="normal"><a href="#__codelineno-0-638"> 638</a></span>
+<span class="normal"><a href="#__codelineno-0-639"> 639</a></span>
+<span class="normal"><a href="#__codelineno-0-640"> 640</a></span>
+<span class="normal"><a href="#__codelineno-0-641"> 641</a></span>
+<span class="normal"><a href="#__codelineno-0-642"> 642</a></span>
+<span class="normal"><a href="#__codelineno-0-643"> 643</a></span>
+<span class="normal"><a href="#__codelineno-0-644"> 644</a></span>
+<span class="normal"><a href="#__codelineno-0-645"> 645</a></span>
+<span class="normal"><a href="#__codelineno-0-646"> 646</a></span>
+<span class="normal"><a href="#__codelineno-0-647"> 647</a></span>
+<span class="normal"><a href="#__codelineno-0-648"> 648</a></span>
+<span class="normal"><a href="#__codelineno-0-649"> 649</a></span>
+<span class="normal"><a href="#__codelineno-0-650"> 650</a></span>
+<span class="normal"><a href="#__codelineno-0-651"> 651</a></span>
+<span class="normal"><a href="#__codelineno-0-652"> 652</a></span>
+<span class="normal"><a href="#__codelineno-0-653"> 653</a></span>
+<span class="normal"><a href="#__codelineno-0-654"> 654</a></span>
+<span class="normal"><a href="#__codelineno-0-655"> 655</a></span>
+<span class="normal"><a href="#__codelineno-0-656"> 656</a></span>
+<span class="normal"><a href="#__codelineno-0-657"> 657</a></span>
+<span class="normal"><a href="#__codelineno-0-658"> 658</a></span>
+<span class="normal"><a href="#__codelineno-0-659"> 659</a></span>
+<span class="normal"><a href="#__codelineno-0-660"> 660</a></span>
+<span class="normal"><a href="#__codelineno-0-661"> 661</a></span>
+<span class="normal"><a href="#__codelineno-0-662"> 662</a></span>
+<span class="normal"><a href="#__codelineno-0-663"> 663</a></span>
+<span class="normal"><a href="#__codelineno-0-664"> 664</a></span>
+<span class="normal"><a href="#__codelineno-0-665"> 665</a></span>
+<span class="normal"><a href="#__codelineno-0-666"> 666</a></span>
+<span class="normal"><a href="#__codelineno-0-667"> 667</a></span>
+<span class="normal"><a href="#__codelineno-0-668"> 668</a></span>
+<span class="normal"><a href="#__codelineno-0-669"> 669</a></span>
+<span class="normal"><a href="#__codelineno-0-670"> 670</a></span>
+<span class="normal"><a href="#__codelineno-0-671"> 671</a></span>
+<span class="normal"><a href="#__codelineno-0-672"> 672</a></span>
+<span class="normal"><a href="#__codelineno-0-673"> 673</a></span>
+<span class="normal"><a href="#__codelineno-0-674"> 674</a></span>
+<span class="normal"><a href="#__codelineno-0-675"> 675</a></span>
+<span class="normal"><a href="#__codelineno-0-676"> 676</a></span>
+<span class="normal"><a href="#__codelineno-0-677"> 677</a></span>
+<span class="normal"><a href="#__codelineno-0-678"> 678</a></span>
+<span class="normal"><a href="#__codelineno-0-679"> 679</a></span>
+<span class="normal"><a href="#__codelineno-0-680"> 680</a></span>
+<span class="normal"><a href="#__codelineno-0-681"> 681</a></span>
+<span class="normal"><a href="#__codelineno-0-682"> 682</a></span>
+<span class="normal"><a href="#__codelineno-0-683"> 683</a></span>
+<span class="normal"><a href="#__codelineno-0-684"> 684</a></span>
+<span class="normal"><a href="#__codelineno-0-685"> 685</a></span>
+<span class="normal"><a href="#__codelineno-0-686"> 686</a></span>
+<span class="normal"><a href="#__codelineno-0-687"> 687</a></span>
+<span class="normal"><a href="#__codelineno-0-688"> 688</a></span>
+<span class="normal"><a href="#__codelineno-0-689"> 689</a></span>
+<span class="normal"><a href="#__codelineno-0-690"> 690</a></span>
+<span class="normal"><a href="#__codelineno-0-691"> 691</a></span>
+<span class="normal"><a href="#__codelineno-0-692"> 692</a></span>
+<span class="normal"><a href="#__codelineno-0-693"> 693</a></span>
+<span class="normal"><a href="#__codelineno-0-694"> 694</a></span>
+<span class="normal"><a href="#__codelineno-0-695"> 695</a></span>
+<span class="normal"><a href="#__codelineno-0-696"> 696</a></span>
+<span class="normal"><a href="#__codelineno-0-697"> 697</a></span>
+<span class="normal"><a href="#__codelineno-0-698"> 698</a></span>
+<span class="normal"><a href="#__codelineno-0-699"> 699</a></span>
+<span class="normal"><a href="#__codelineno-0-700"> 700</a></span>
+<span class="normal"><a href="#__codelineno-0-701"> 701</a></span>
+<span class="normal"><a href="#__codelineno-0-702"> 702</a></span>
+<span class="normal"><a href="#__codelineno-0-703"> 703</a></span>
+<span class="normal"><a href="#__codelineno-0-704"> 704</a></span>
+<span class="normal"><a href="#__codelineno-0-705"> 705</a></span>
+<span class="normal"><a href="#__codelineno-0-706"> 706</a></span>
+<span class="normal"><a href="#__codelineno-0-707"> 707</a></span>
+<span class="normal"><a href="#__codelineno-0-708"> 708</a></span>
+<span class="normal"><a href="#__codelineno-0-709"> 709</a></span>
+<span class="normal"><a href="#__codelineno-0-710"> 710</a></span>
+<span class="normal"><a href="#__codelineno-0-711"> 711</a></span>
+<span class="normal"><a href="#__codelineno-0-712"> 712</a></span>
+<span class="normal"><a href="#__codelineno-0-713"> 713</a></span>
+<span class="normal"><a href="#__codelineno-0-714"> 714</a></span>
+<span class="normal"><a href="#__codelineno-0-715"> 715</a></span>
+<span class="normal"><a href="#__codelineno-0-716"> 716</a></span>
+<span class="normal"><a href="#__codelineno-0-717"> 717</a></span>
+<span class="normal"><a href="#__codelineno-0-718"> 718</a></span>
+<span class="normal"><a href="#__codelineno-0-719"> 719</a></span>
+<span class="normal"><a href="#__codelineno-0-720"> 720</a></span>
+<span class="normal"><a href="#__codelineno-0-721"> 721</a></span>
+<span class="normal"><a href="#__codelineno-0-722"> 722</a></span>
+<span class="normal"><a href="#__codelineno-0-723"> 723</a></span>
+<span class="normal"><a href="#__codelineno-0-724"> 724</a></span>
+<span class="normal"><a href="#__codelineno-0-725"> 725</a></span>
+<span class="normal"><a href="#__codelineno-0-726"> 726</a></span>
+<span class="normal"><a href="#__codelineno-0-727"> 727</a></span>
+<span class="normal"><a href="#__codelineno-0-728"> 728</a></span>
+<span class="normal"><a href="#__codelineno-0-729"> 729</a></span>
+<span class="normal"><a href="#__codelineno-0-730"> 730</a></span>
+<span class="normal"><a href="#__codelineno-0-731"> 731</a></span>
+<span class="normal"><a href="#__codelineno-0-732"> 732</a></span>
+<span class="normal"><a href="#__codelineno-0-733"> 733</a></span>
+<span class="normal"><a href="#__codelineno-0-734"> 734</a></span>
+<span class="normal"><a href="#__codelineno-0-735"> 735</a></span>
+<span class="normal"><a href="#__codelineno-0-736"> 736</a></span>
+<span class="normal"><a href="#__codelineno-0-737"> 737</a></span>
+<span class="normal"><a href="#__codelineno-0-738"> 738</a></span>
+<span class="normal"><a href="#__codelineno-0-739"> 739</a></span>
+<span class="normal"><a href="#__codelineno-0-740"> 740</a></span>
+<span class="normal"><a href="#__codelineno-0-741"> 741</a></span>
+<span class="normal"><a href="#__codelineno-0-742"> 742</a></span>
+<span class="normal"><a href="#__codelineno-0-743"> 743</a></span>
+<span class="normal"><a href="#__codelineno-0-744"> 744</a></span>
+<span class="normal"><a href="#__codelineno-0-745"> 745</a></span>
+<span class="normal"><a href="#__codelineno-0-746"> 746</a></span>
+<span class="normal"><a href="#__codelineno-0-747"> 747</a></span>
+<span class="normal"><a href="#__codelineno-0-748"> 748</a></span>
+<span class="normal"><a href="#__codelineno-0-749"> 749</a></span>
+<span class="normal"><a href="#__codelineno-0-750"> 750</a></span>
+<span class="normal"><a href="#__codelineno-0-751"> 751</a></span>
+<span class="normal"><a href="#__codelineno-0-752"> 752</a></span>
+<span class="normal"><a href="#__codelineno-0-753"> 753</a></span>
+<span class="normal"><a href="#__codelineno-0-754"> 754</a></span>
+<span class="normal"><a href="#__codelineno-0-755"> 755</a></span>
+<span class="normal"><a href="#__codelineno-0-756"> 756</a></span>
+<span class="normal"><a href="#__codelineno-0-757"> 757</a></span>
+<span class="normal"><a href="#__codelineno-0-758"> 758</a></span>
+<span class="normal"><a href="#__codelineno-0-759"> 759</a></span>
+<span class="normal"><a href="#__codelineno-0-760"> 760</a></span>
+<span class="normal"><a href="#__codelineno-0-761"> 761</a></span>
+<span class="normal"><a href="#__codelineno-0-762"> 762</a></span>
+<span class="normal"><a href="#__codelineno-0-763"> 763</a></span>
+<span class="normal"><a href="#__codelineno-0-764"> 764</a></span>
+<span class="normal"><a href="#__codelineno-0-765"> 765</a></span>
+<span class="normal"><a href="#__codelineno-0-766"> 766</a></span>
+<span class="normal"><a href="#__codelineno-0-767"> 767</a></span>
+<span class="normal"><a href="#__codelineno-0-768"> 768</a></span>
+<span class="normal"><a href="#__codelineno-0-769"> 769</a></span>
+<span class="normal"><a href="#__codelineno-0-770"> 770</a></span>
+<span class="normal"><a href="#__codelineno-0-771"> 771</a></span>
+<span class="normal"><a href="#__codelineno-0-772"> 772</a></span>
+<span class="normal"><a href="#__codelineno-0-773"> 773</a></span>
+<span class="normal"><a href="#__codelineno-0-774"> 774</a></span>
+<span class="normal"><a href="#__codelineno-0-775"> 775</a></span>
+<span class="normal"><a href="#__codelineno-0-776"> 776</a></span>
+<span class="normal"><a href="#__codelineno-0-777"> 777</a></span>
+<span class="normal"><a href="#__codelineno-0-778"> 778</a></span>
+<span class="normal"><a href="#__codelineno-0-779"> 779</a></span>
+<span class="normal"><a href="#__codelineno-0-780"> 780</a></span>
+<span class="normal"><a href="#__codelineno-0-781"> 781</a></span>
+<span class="normal"><a href="#__codelineno-0-782"> 782</a></span>
+<span class="normal"><a href="#__codelineno-0-783"> 783</a></span>
+<span class="normal"><a href="#__codelineno-0-784"> 784</a></span>
+<span class="normal"><a href="#__codelineno-0-785"> 785</a></span>
+<span class="normal"><a href="#__codelineno-0-786"> 786</a></span>
+<span class="normal"><a href="#__codelineno-0-787"> 787</a></span>
+<span class="normal"><a href="#__codelineno-0-788"> 788</a></span>
+<span class="normal"><a href="#__codelineno-0-789"> 789</a></span>
+<span class="normal"><a href="#__codelineno-0-790"> 790</a></span>
+<span class="normal"><a href="#__codelineno-0-791"> 791</a></span>
+<span class="normal"><a href="#__codelineno-0-792"> 792</a></span>
+<span class="normal"><a href="#__codelineno-0-793"> 793</a></span>
+<span class="normal"><a href="#__codelineno-0-794"> 794</a></span>
+<span class="normal"><a href="#__codelineno-0-795"> 795</a></span>
+<span class="normal"><a href="#__codelineno-0-796"> 796</a></span>
+<span class="normal"><a href="#__codelineno-0-797"> 797</a></span>
+<span class="normal"><a href="#__codelineno-0-798"> 798</a></span>
+<span class="normal"><a href="#__codelineno-0-799"> 799</a></span>
+<span class="normal"><a href="#__codelineno-0-800"> 800</a></span>
+<span class="normal"><a href="#__codelineno-0-801"> 801</a></span>
+<span class="normal"><a href="#__codelineno-0-802"> 802</a></span>
+<span class="normal"><a href="#__codelineno-0-803"> 803</a></span>
+<span class="normal"><a href="#__codelineno-0-804"> 804</a></span>
+<span class="normal"><a href="#__codelineno-0-805"> 805</a></span>
+<span class="normal"><a href="#__codelineno-0-806"> 806</a></span>
+<span class="normal"><a href="#__codelineno-0-807"> 807</a></span>
+<span class="normal"><a href="#__codelineno-0-808"> 808</a></span>
+<span class="normal"><a href="#__codelineno-0-809"> 809</a></span>
+<span class="normal"><a href="#__codelineno-0-810"> 810</a></span>
+<span class="normal"><a href="#__codelineno-0-811"> 811</a></span>
+<span class="normal"><a href="#__codelineno-0-812"> 812</a></span>
+<span class="normal"><a href="#__codelineno-0-813"> 813</a></span>
+<span class="normal"><a href="#__codelineno-0-814"> 814</a></span>
+<span class="normal"><a href="#__codelineno-0-815"> 815</a></span>
+<span class="normal"><a href="#__codelineno-0-816"> 816</a></span>
+<span class="normal"><a href="#__codelineno-0-817"> 817</a></span>
+<span class="normal"><a href="#__codelineno-0-818"> 818</a></span>
+<span class="normal"><a href="#__codelineno-0-819"> 819</a></span>
+<span class="normal"><a href="#__codelineno-0-820"> 820</a></span>
+<span class="normal"><a href="#__codelineno-0-821"> 821</a></span>
+<span class="normal"><a href="#__codelineno-0-822"> 822</a></span>
+<span class="normal"><a href="#__codelineno-0-823"> 823</a></span>
+<span class="normal"><a href="#__codelineno-0-824"> 824</a></span>
+<span class="normal"><a href="#__codelineno-0-825"> 825</a></span>
+<span class="normal"><a href="#__codelineno-0-826"> 826</a></span>
+<span class="normal"><a href="#__codelineno-0-827"> 827</a></span>
+<span class="normal"><a href="#__codelineno-0-828"> 828</a></span>
+<span class="normal"><a href="#__codelineno-0-829"> 829</a></span>
+<span class="normal"><a href="#__codelineno-0-830"> 830</a></span>
+<span class="normal"><a href="#__codelineno-0-831"> 831</a></span>
+<span class="normal"><a href="#__codelineno-0-832"> 832</a></span>
+<span class="normal"><a href="#__codelineno-0-833"> 833</a></span>
+<span class="normal"><a href="#__codelineno-0-834"> 834</a></span>
+<span class="normal"><a href="#__codelineno-0-835"> 835</a></span>
+<span class="normal"><a href="#__codelineno-0-836"> 836</a></span>
+<span class="normal"><a href="#__codelineno-0-837"> 837</a></span>
+<span class="normal"><a href="#__codelineno-0-838"> 838</a></span>
+<span class="normal"><a href="#__codelineno-0-839"> 839</a></span>
+<span class="normal"><a href="#__codelineno-0-840"> 840</a></span>
+<span class="normal"><a href="#__codelineno-0-841"> 841</a></span>
+<span class="normal"><a href="#__codelineno-0-842"> 842</a></span>
+<span class="normal"><a href="#__codelineno-0-843"> 843</a></span>
+<span class="normal"><a href="#__codelineno-0-844"> 844</a></span>
+<span class="normal"><a href="#__codelineno-0-845"> 845</a></span>
+<span class="normal"><a href="#__codelineno-0-846"> 846</a></span>
+<span class="normal"><a href="#__codelineno-0-847"> 847</a></span>
+<span class="normal"><a href="#__codelineno-0-848"> 848</a></span>
+<span class="normal"><a href="#__codelineno-0-849"> 849</a></span>
+<span class="normal"><a href="#__codelineno-0-850"> 850</a></span>
+<span class="normal"><a href="#__codelineno-0-851"> 851</a></span>
+<span class="normal"><a href="#__codelineno-0-852"> 852</a></span>
+<span class="normal"><a href="#__codelineno-0-853"> 853</a></span>
+<span class="normal"><a href="#__codelineno-0-854"> 854</a></span>
+<span class="normal"><a href="#__codelineno-0-855"> 855</a></span>
+<span class="normal"><a href="#__codelineno-0-856"> 856</a></span>
+<span class="normal"><a href="#__codelineno-0-857"> 857</a></span>
+<span class="normal"><a href="#__codelineno-0-858"> 858</a></span>
+<span class="normal"><a href="#__codelineno-0-859"> 859</a></span>
+<span class="normal"><a href="#__codelineno-0-860"> 860</a></span>
+<span class="normal"><a href="#__codelineno-0-861"> 861</a></span>
+<span class="normal"><a href="#__codelineno-0-862"> 862</a></span>
+<span class="normal"><a href="#__codelineno-0-863"> 863</a></span>
+<span class="normal"><a href="#__codelineno-0-864"> 864</a></span>
+<span class="normal"><a href="#__codelineno-0-865"> 865</a></span>
+<span class="normal"><a href="#__codelineno-0-866"> 866</a></span>
+<span class="normal"><a href="#__codelineno-0-867"> 867</a></span>
+<span class="normal"><a href="#__codelineno-0-868"> 868</a></span>
+<span class="normal"><a href="#__codelineno-0-869"> 869</a></span>
+<span class="normal"><a href="#__codelineno-0-870"> 870</a></span>
+<span class="normal"><a href="#__codelineno-0-871"> 871</a></span>
+<span class="normal"><a href="#__codelineno-0-872"> 872</a></span>
+<span class="normal"><a href="#__codelineno-0-873"> 873</a></span>
+<span class="normal"><a href="#__codelineno-0-874"> 874</a></span>
+<span class="normal"><a href="#__codelineno-0-875"> 875</a></span>
+<span class="normal"><a href="#__codelineno-0-876"> 876</a></span>
+<span class="normal"><a href="#__codelineno-0-877"> 877</a></span>
+<span class="normal"><a href="#__codelineno-0-878"> 878</a></span>
+<span class="normal"><a href="#__codelineno-0-879"> 879</a></span>
+<span class="normal"><a href="#__codelineno-0-880"> 880</a></span>
+<span class="normal"><a href="#__codelineno-0-881"> 881</a></span>
+<span class="normal"><a href="#__codelineno-0-882"> 882</a></span>
+<span class="normal"><a href="#__codelineno-0-883"> 883</a></span>
+<span class="normal"><a href="#__codelineno-0-884"> 884</a></span>
+<span class="normal"><a href="#__codelineno-0-885"> 885</a></span>
+<span class="normal"><a href="#__codelineno-0-886"> 886</a></span>
+<span class="normal"><a href="#__codelineno-0-887"> 887</a></span>
+<span class="normal"><a href="#__codelineno-0-888"> 888</a></span>
+<span class="normal"><a href="#__codelineno-0-889"> 889</a></span>
+<span class="normal"><a href="#__codelineno-0-890"> 890</a></span>
+<span class="normal"><a href="#__codelineno-0-891"> 891</a></span>
+<span class="normal"><a href="#__codelineno-0-892"> 892</a></span>
+<span class="normal"><a href="#__codelineno-0-893"> 893</a></span>
+<span class="normal"><a href="#__codelineno-0-894"> 894</a></span>
+<span class="normal"><a href="#__codelineno-0-895"> 895</a></span>
+<span class="normal"><a href="#__codelineno-0-896"> 896</a></span>
+<span class="normal"><a href="#__codelineno-0-897"> 897</a></span>
+<span class="normal"><a href="#__codelineno-0-898"> 898</a></span>
+<span class="normal"><a href="#__codelineno-0-899"> 899</a></span>
+<span class="normal"><a href="#__codelineno-0-900"> 900</a></span>
+<span class="normal"><a href="#__codelineno-0-901"> 901</a></span>
+<span class="normal"><a href="#__codelineno-0-902"> 902</a></span>
+<span class="normal"><a href="#__codelineno-0-903"> 903</a></span>
+<span class="normal"><a href="#__codelineno-0-904"> 904</a></span>
+<span class="normal"><a href="#__codelineno-0-905"> 905</a></span>
+<span class="normal"><a href="#__codelineno-0-906"> 906</a></span>
+<span class="normal"><a href="#__codelineno-0-907"> 907</a></span>
+<span class="normal"><a href="#__codelineno-0-908"> 908</a></span>
+<span class="normal"><a href="#__codelineno-0-909"> 909</a></span>
+<span class="normal"><a href="#__codelineno-0-910"> 910</a></span>
+<span class="normal"><a href="#__codelineno-0-911"> 911</a></span>
+<span class="normal"><a href="#__codelineno-0-912"> 912</a></span>
+<span class="normal"><a href="#__codelineno-0-913"> 913</a></span>
+<span class="normal"><a href="#__codelineno-0-914"> 914</a></span>
+<span class="normal"><a href="#__codelineno-0-915"> 915</a></span>
+<span class="normal"><a href="#__codelineno-0-916"> 916</a></span>
+<span class="normal"><a href="#__codelineno-0-917"> 917</a></span>
+<span class="normal"><a href="#__codelineno-0-918"> 918</a></span>
+<span class="normal"><a href="#__codelineno-0-919"> 919</a></span>
+<span class="normal"><a href="#__codelineno-0-920"> 920</a></span>
+<span class="normal"><a href="#__codelineno-0-921"> 921</a></span>
+<span class="normal"><a href="#__codelineno-0-922"> 922</a></span>
+<span class="normal"><a href="#__codelineno-0-923"> 923</a></span>
+<span class="normal"><a href="#__codelineno-0-924"> 924</a></span>
+<span class="normal"><a href="#__codelineno-0-925"> 925</a></span>
+<span class="normal"><a href="#__codelineno-0-926"> 926</a></span>
+<span class="normal"><a href="#__codelineno-0-927"> 927</a></span>
+<span class="normal"><a href="#__codelineno-0-928"> 928</a></span>
+<span class="normal"><a href="#__codelineno-0-929"> 929</a></span>
+<span class="normal"><a href="#__codelineno-0-930"> 930</a></span>
+<span class="normal"><a href="#__codelineno-0-931"> 931</a></span>
+<span class="normal"><a href="#__codelineno-0-932"> 932</a></span>
+<span class="normal"><a href="#__codelineno-0-933"> 933</a></span>
+<span class="normal"><a href="#__codelineno-0-934"> 934</a></span>
+<span class="normal"><a href="#__codelineno-0-935"> 935</a></span>
+<span class="normal"><a href="#__codelineno-0-936"> 936</a></span>
+<span class="normal"><a href="#__codelineno-0-937"> 937</a></span>
+<span class="normal"><a href="#__codelineno-0-938"> 938</a></span>
+<span class="normal"><a href="#__codelineno-0-939"> 939</a></span>
+<span class="normal"><a href="#__codelineno-0-940"> 940</a></span>
+<span class="normal"><a href="#__codelineno-0-941"> 941</a></span>
+<span class="normal"><a href="#__codelineno-0-942"> 942</a></span>
+<span class="normal"><a href="#__codelineno-0-943"> 943</a></span>
+<span class="normal"><a href="#__codelineno-0-944"> 944</a></span>
+<span class="normal"><a href="#__codelineno-0-945"> 945</a></span>
+<span class="normal"><a href="#__codelineno-0-946"> 946</a></span>
+<span class="normal"><a href="#__codelineno-0-947"> 947</a></span>
+<span class="normal"><a href="#__codelineno-0-948"> 948</a></span>
+<span class="normal"><a href="#__codelineno-0-949"> 949</a></span>
+<span class="normal"><a href="#__codelineno-0-950"> 950</a></span>
+<span class="normal"><a href="#__codelineno-0-951"> 951</a></span>
+<span class="normal"><a href="#__codelineno-0-952"> 952</a></span>
+<span class="normal"><a href="#__codelineno-0-953"> 953</a></span>
+<span class="normal"><a href="#__codelineno-0-954"> 954</a></span>
+<span class="normal"><a href="#__codelineno-0-955"> 955</a></span>
+<span class="normal"><a href="#__codelineno-0-956"> 956</a></span>
+<span class="normal"><a href="#__codelineno-0-957"> 957</a></span>
+<span class="normal"><a href="#__codelineno-0-958"> 958</a></span>
+<span class="normal"><a href="#__codelineno-0-959"> 959</a></span>
+<span class="normal"><a href="#__codelineno-0-960"> 960</a></span>
+<span class="normal"><a href="#__codelineno-0-961"> 961</a></span>
+<span class="normal"><a href="#__codelineno-0-962"> 962</a></span>
+<span class="normal"><a href="#__codelineno-0-963"> 963</a></span>
+<span class="normal"><a href="#__codelineno-0-964"> 964</a></span>
+<span class="normal"><a href="#__codelineno-0-965"> 965</a></span>
+<span class="normal"><a href="#__codelineno-0-966"> 966</a></span>
+<span class="normal"><a href="#__codelineno-0-967"> 967</a></span>
+<span class="normal"><a href="#__codelineno-0-968"> 968</a></span>
+<span class="normal"><a href="#__codelineno-0-969"> 969</a></span>
+<span class="normal"><a href="#__codelineno-0-970"> 970</a></span>
+<span class="normal"><a href="#__codelineno-0-971"> 971</a></span>
+<span class="normal"><a href="#__codelineno-0-972"> 972</a></span>
+<span class="normal"><a href="#__codelineno-0-973"> 973</a></span>
+<span class="normal"><a href="#__codelineno-0-974"> 974</a></span>
+<span class="normal"><a href="#__codelineno-0-975"> 975</a></span>
+<span class="normal"><a href="#__codelineno-0-976"> 976</a></span>
+<span class="normal"><a href="#__codelineno-0-977"> 977</a></span>
+<span class="normal"><a href="#__codelineno-0-978"> 978</a></span>
+<span class="normal"><a href="#__codelineno-0-979"> 979</a></span>
+<span class="normal"><a href="#__codelineno-0-980"> 980</a></span>
+<span class="normal"><a href="#__codelineno-0-981"> 981</a></span>
+<span class="normal"><a href="#__codelineno-0-982"> 982</a></span>
+<span class="normal"><a href="#__codelineno-0-983"> 983</a></span>
+<span class="normal"><a href="#__codelineno-0-984"> 984</a></span>
+<span class="normal"><a href="#__codelineno-0-985"> 985</a></span>
+<span class="normal"><a href="#__codelineno-0-986"> 986</a></span>
+<span class="normal"><a href="#__codelineno-0-987"> 987</a></span>
+<span class="normal"><a href="#__codelineno-0-988"> 988</a></span>
+<span class="normal"><a href="#__codelineno-0-989"> 989</a></span>
+<span class="normal"><a href="#__codelineno-0-990"> 990</a></span>
+<span class="normal"><a href="#__codelineno-0-991"> 991</a></span>
+<span class="normal"><a href="#__codelineno-0-992"> 992</a></span>
+<span class="normal"><a href="#__codelineno-0-993"> 993</a></span>
+<span class="normal"><a href="#__codelineno-0-994"> 994</a></span>
+<span class="normal"><a href="#__codelineno-0-995"> 995</a></span>
+<span class="normal"><a href="#__codelineno-0-996"> 996</a></span>
+<span class="normal"><a href="#__codelineno-0-997"> 997</a></span>
+<span class="normal"><a href="#__codelineno-0-998"> 998</a></span>
+<span class="normal"><a href="#__codelineno-0-999"> 999</a></span>
+<span class="normal"><a href="#__codelineno-0-1000">1000</a></span>
+<span class="normal"><a href="#__codelineno-0-1001">1001</a></span>
+<span class="normal"><a href="#__codelineno-0-1002">1002</a></span>
+<span class="normal"><a href="#__codelineno-0-1003">1003</a></span>
+<span class="normal"><a href="#__codelineno-0-1004">1004</a></span>
+<span class="normal"><a href="#__codelineno-0-1005">1005</a></span>
+<span class="normal"><a href="#__codelineno-0-1006">1006</a></span>
+<span class="normal"><a href="#__codelineno-0-1007">1007</a></span>
+<span class="normal"><a href="#__codelineno-0-1008">1008</a></span>
+<span class="normal"><a href="#__codelineno-0-1009">1009</a></span>
+<span class="normal"><a href="#__codelineno-0-1010">1010</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-128"><a id="__codelineno-0-128" name="__codelineno-0-128"></a><span class="k">class</span> <span class="nc">BaseDataset</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+</span><span id="__span-0-129"><a id="__codelineno-0-129" name="__codelineno-0-129"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Base class for all datasets. It implements all generic loading, processing, and writing methods.&quot;&quot;&quot;</span>
+</span><span id="__span-0-130"><a id="__codelineno-0-130" name="__codelineno-0-130"></a>
+</span><span id="__span-0-131"><a id="__codelineno-0-131" name="__codelineno-0-131"></a>    <span class="n">DATASET_ID</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-132"><a id="__codelineno-0-132" name="__codelineno-0-132"></a>    <span class="n">SOURCE_ID</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-133"><a id="__codelineno-0-133" name="__codelineno-0-133"></a>
+</span><span id="__span-0-134"><a id="__codelineno-0-134" name="__codelineno-0-134"></a>    <span class="n">TITLE</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-135"><a id="__codelineno-0-135" name="__codelineno-0-135"></a>    <span class="n">DESCRIPTION</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>
+</span><span id="__span-0-136"><a id="__codelineno-0-136" name="__codelineno-0-136"></a>    <span class="n">HOMEPAGE</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-137"><a id="__codelineno-0-137" name="__codelineno-0-137"></a>    <span class="n">AVAILIBILITY</span><span class="p">:</span> <span class="n">Availability</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-138"><a id="__codelineno-0-138" name="__codelineno-0-138"></a>    <span class="n">DOWNLOAD_URLS</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-139"><a id="__codelineno-0-139" name="__codelineno-0-139"></a>    <span class="n">LOCAL_DIRS</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-140"><a id="__codelineno-0-140" name="__codelineno-0-140"></a>    <span class="n">VERSION</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-141"><a id="__codelineno-0-141" name="__codelineno-0-141"></a>    <span class="n">DOI</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-142"><a id="__codelineno-0-142" name="__codelineno-0-142"></a>    <span class="n">CITATION</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-143"><a id="__codelineno-0-143" name="__codelineno-0-143"></a>
+</span><span id="__span-0-144"><a id="__codelineno-0-144" name="__codelineno-0-144"></a>    <span class="n">LICENSE</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">License</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-145"><a id="__codelineno-0-145" name="__codelineno-0-145"></a>    <span class="n">PII</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-146"><a id="__codelineno-0-146" name="__codelineno-0-146"></a>
+</span><span id="__span-0-147"><a id="__codelineno-0-147" name="__codelineno-0-147"></a>    <span class="n">LANGUAGES</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-148"><a id="__codelineno-0-148" name="__codelineno-0-148"></a>
+</span><span id="__span-0-149"><a id="__codelineno-0-149" name="__codelineno-0-149"></a>    <span class="n">SUPERVISED</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-150"><a id="__codelineno-0-150" name="__codelineno-0-150"></a>    <span class="n">TRANSLATIONS</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-151"><a id="__codelineno-0-151" name="__codelineno-0-151"></a>    <span class="n">WEB_CRAWLED</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-152"><a id="__codelineno-0-152" name="__codelineno-0-152"></a>    <span class="n">QUALITY_WARNINGS</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">QualityWarning</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-153"><a id="__codelineno-0-153" name="__codelineno-0-153"></a>    <span class="n">GENRES</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Genre</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-154"><a id="__codelineno-0-154" name="__codelineno-0-154"></a>    <span class="n">HAS_OVERLAP_WITH</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Type</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-155"><a id="__codelineno-0-155" name="__codelineno-0-155"></a>    <span class="n">USED_BY</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-156"><a id="__codelineno-0-156" name="__codelineno-0-156"></a>    <span class="n">DUMMY</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-157"><a id="__codelineno-0-157" name="__codelineno-0-157"></a>    <span class="n">SINGLE_OUTPUT_FILE</span> <span class="o">=</span> <span class="kc">True</span>
+</span><span id="__span-0-158"><a id="__codelineno-0-158" name="__codelineno-0-158"></a>    <span class="n">HAS_PREDEFINED_VALIDATION_SET</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-159"><a id="__codelineno-0-159" name="__codelineno-0-159"></a>
+</span><span id="__span-0-160"><a id="__codelineno-0-160" name="__codelineno-0-160"></a>    <span class="c1"># Statistics</span>
+</span><span id="__span-0-161"><a id="__codelineno-0-161" name="__codelineno-0-161"></a>    <span class="n">TOKENS</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-162"><a id="__codelineno-0-162" name="__codelineno-0-162"></a>    <span class="n">BYTES</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-163"><a id="__codelineno-0-163" name="__codelineno-0-163"></a>
+</span><span id="__span-0-164"><a id="__codelineno-0-164" name="__codelineno-0-164"></a>    <span class="n">counter</span> <span class="o">=</span> <span class="n">Counter</span><span class="p">()</span>
+</span><span id="__span-0-165"><a id="__codelineno-0-165" name="__codelineno-0-165"></a>
+</span><span id="__span-0-166"><a id="__codelineno-0-166" name="__codelineno-0-166"></a>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
+</span><span id="__span-0-167"><a id="__codelineno-0-167" name="__codelineno-0-167"></a>        <span class="bp">self</span><span class="p">,</span>
+</span><span id="__span-0-168"><a id="__codelineno-0-168" name="__codelineno-0-168"></a>        <span class="n">text_datasets_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-169"><a id="__codelineno-0-169" name="__codelineno-0-169"></a>        <span class="n">raw_datasets_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-170"><a id="__codelineno-0-170" name="__codelineno-0-170"></a>        <span class="n">workers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
+</span><span id="__span-0-171"><a id="__codelineno-0-171" name="__codelineno-0-171"></a>        <span class="n">output_text_field</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;text&quot;</span><span class="p">,</span>
+</span><span id="__span-0-172"><a id="__codelineno-0-172" name="__codelineno-0-172"></a>        <span class="n">override_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-173"><a id="__codelineno-0-173" name="__codelineno-0-173"></a>        <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+</span><span id="__span-0-174"><a id="__codelineno-0-174" name="__codelineno-0-174"></a>        <span class="n">skip_items</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+</span><span id="__span-0-175"><a id="__codelineno-0-175" name="__codelineno-0-175"></a>        <span class="n">hf_auth_token</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-176"><a id="__codelineno-0-176" name="__codelineno-0-176"></a>        <span class="n">print_write_progress</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10_000</span><span class="p">,</span>
+</span><span id="__span-0-177"><a id="__codelineno-0-177" name="__codelineno-0-177"></a>        <span class="n">min_length</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-178"><a id="__codelineno-0-178" name="__codelineno-0-178"></a>        <span class="n">json_ensure_ascii</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-179"><a id="__codelineno-0-179" name="__codelineno-0-179"></a>        <span class="n">title_delimiter</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;:</span><span class="se">\n\n</span><span class="s2">&quot;</span><span class="p">,</span>
+</span><span id="__span-0-180"><a id="__codelineno-0-180" name="__codelineno-0-180"></a>        <span class="n">paragraph_delimiter</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;</span><span class="se">\n\n</span><span class="s2">&quot;</span><span class="p">,</span>
+</span><span id="__span-0-181"><a id="__codelineno-0-181" name="__codelineno-0-181"></a>        <span class="n">sentence_delimiter</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot; &quot;</span><span class="p">,</span>
+</span><span id="__span-0-182"><a id="__codelineno-0-182" name="__codelineno-0-182"></a>        <span class="n">output_format</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;jsonl&quot;</span><span class="p">,</span> <span class="s2">&quot;parquet&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;jsonl&quot;</span><span class="p">,</span>
+</span><span id="__span-0-183"><a id="__codelineno-0-183" name="__codelineno-0-183"></a>        <span class="n">output_compression</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
+</span><span id="__span-0-184"><a id="__codelineno-0-184" name="__codelineno-0-184"></a>            <span class="nb">str</span>
+</span><span id="__span-0-185"><a id="__codelineno-0-185" name="__codelineno-0-185"></a>        <span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>  <span class="c1"># jsonl: gzip, parquet: ‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’</span>
+</span><span id="__span-0-186"><a id="__codelineno-0-186" name="__codelineno-0-186"></a>        <span class="n">output_batch_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1000</span><span class="p">,</span>
+</span><span id="__span-0-187"><a id="__codelineno-0-187" name="__codelineno-0-187"></a>        <span class="n">shuffled_datasets_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-188"><a id="__codelineno-0-188" name="__codelineno-0-188"></a>        <span class="n">max_output_chunk_uncompressed_bytes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-189"><a id="__codelineno-0-189" name="__codelineno-0-189"></a>        <span class="n">max_output_chunk_rows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-190"><a id="__codelineno-0-190" name="__codelineno-0-190"></a>        <span class="n">config</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Config</span><span class="p">,</span> <span class="nb">dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-191"><a id="__codelineno-0-191" name="__codelineno-0-191"></a>        <span class="o">**</span><span class="n">kwargs</span><span class="p">,</span>
+</span><span id="__span-0-192"><a id="__codelineno-0-192" name="__codelineno-0-192"></a>    <span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-193"><a id="__codelineno-0-193" name="__codelineno-0-193"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">text_datasets_dir</span> <span class="o">=</span> <span class="n">text_datasets_dir</span>
+</span><span id="__span-0-194"><a id="__codelineno-0-194" name="__codelineno-0-194"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">raw_datasets_dir</span> <span class="o">=</span> <span class="n">raw_datasets_dir</span>
+</span><span id="__span-0-195"><a id="__codelineno-0-195" name="__codelineno-0-195"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">workers</span> <span class="o">=</span> <span class="n">workers</span>
+</span><span id="__span-0-196"><a id="__codelineno-0-196" name="__codelineno-0-196"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">output_text_field</span> <span class="o">=</span> <span class="n">output_text_field</span>
+</span><span id="__span-0-197"><a id="__codelineno-0-197" name="__codelineno-0-197"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">override_output</span> <span class="o">=</span> <span class="n">override_output</span>
+</span><span id="__span-0-198"><a id="__codelineno-0-198" name="__codelineno-0-198"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">limit</span> <span class="o">=</span> <span class="n">limit</span>
+</span><span id="__span-0-199"><a id="__codelineno-0-199" name="__codelineno-0-199"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">skip_items</span> <span class="o">=</span> <span class="n">skip_items</span>
+</span><span id="__span-0-200"><a id="__codelineno-0-200" name="__codelineno-0-200"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">hf_auth_token</span> <span class="o">=</span> <span class="n">hf_auth_token</span>
+</span><span id="__span-0-201"><a id="__codelineno-0-201" name="__codelineno-0-201"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">print_write_progress</span> <span class="o">=</span> <span class="n">print_write_progress</span>
+</span><span id="__span-0-202"><a id="__codelineno-0-202" name="__codelineno-0-202"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span> <span class="o">=</span> <span class="n">min_length</span> <span class="k">if</span> <span class="n">min_length</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">DEFAULT_MIN_TEXT_LENGTH</span>
+</span><span id="__span-0-203"><a id="__codelineno-0-203" name="__codelineno-0-203"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">json_ensure_ascii</span> <span class="o">=</span> <span class="n">json_ensure_ascii</span>
+</span><span id="__span-0-204"><a id="__codelineno-0-204" name="__codelineno-0-204"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">title_delimiter</span> <span class="o">=</span> <span class="n">title_delimiter</span>
+</span><span id="__span-0-205"><a id="__codelineno-0-205" name="__codelineno-0-205"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">paragraph_delimiter</span> <span class="o">=</span> <span class="n">paragraph_delimiter</span>
+</span><span id="__span-0-206"><a id="__codelineno-0-206" name="__codelineno-0-206"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">sentence_delimiter</span> <span class="o">=</span> <span class="n">sentence_delimiter</span>
+</span><span id="__span-0-207"><a id="__codelineno-0-207" name="__codelineno-0-207"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">=</span> <span class="n">output_format</span>
+</span><span id="__span-0-208"><a id="__codelineno-0-208" name="__codelineno-0-208"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">output_compression</span> <span class="o">=</span> <span class="n">output_compression</span>
+</span><span id="__span-0-209"><a id="__codelineno-0-209" name="__codelineno-0-209"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span> <span class="o">=</span> <span class="n">output_batch_size</span>
+</span><span id="__span-0-210"><a id="__codelineno-0-210" name="__codelineno-0-210"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">shuffled_datasets_dir</span> <span class="o">=</span> <span class="n">shuffled_datasets_dir</span>
+</span><span id="__span-0-211"><a id="__codelineno-0-211" name="__codelineno-0-211"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_uncompressed_bytes</span> <span class="o">=</span> <span class="n">max_output_chunk_uncompressed_bytes</span>
+</span><span id="__span-0-212"><a id="__codelineno-0-212" name="__codelineno-0-212"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_rows</span> <span class="o">=</span> <span class="n">max_output_chunk_rows</span>
+</span><span id="__span-0-213"><a id="__codelineno-0-213" name="__codelineno-0-213"></a>
+</span><span id="__span-0-214"><a id="__codelineno-0-214" name="__codelineno-0-214"></a>        <span class="c1"># Timer for statistics</span>
+</span><span id="__span-0-215"><a id="__codelineno-0-215" name="__codelineno-0-215"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">start_time</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+</span><span id="__span-0-216"><a id="__codelineno-0-216" name="__codelineno-0-216"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">end_time</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-217"><a id="__codelineno-0-217" name="__codelineno-0-217"></a>
+</span><span id="__span-0-218"><a id="__codelineno-0-218" name="__codelineno-0-218"></a>        <span class="c1"># Generate config from dict</span>
+</span><span id="__span-0-219"><a id="__codelineno-0-219" name="__codelineno-0-219"></a>        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">config</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
+</span><span id="__span-0-220"><a id="__codelineno-0-220" name="__codelineno-0-220"></a>            <span class="n">config</span> <span class="o">=</span> <span class="n">Config</span><span class="p">(</span><span class="o">**</span><span class="n">config</span><span class="p">)</span>
+</span><span id="__span-0-221"><a id="__codelineno-0-221" name="__codelineno-0-221"></a>
+</span><span id="__span-0-222"><a id="__codelineno-0-222" name="__codelineno-0-222"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">config</span> <span class="o">=</span> <span class="n">config</span>
+</span><span id="__span-0-223"><a id="__codelineno-0-223" name="__codelineno-0-223"></a>
+</span><span id="__span-0-224"><a id="__codelineno-0-224" name="__codelineno-0-224"></a>        <span class="c1"># Set kwargs</span>
+</span><span id="__span-0-225"><a id="__codelineno-0-225" name="__codelineno-0-225"></a>        <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
+</span><span id="__span-0-226"><a id="__codelineno-0-226" name="__codelineno-0-226"></a>            <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span>
+</span><span id="__span-0-227"><a id="__codelineno-0-227" name="__codelineno-0-227"></a>                <span class="nb">setattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span>
+</span><span id="__span-0-228"><a id="__codelineno-0-228" name="__codelineno-0-228"></a>            <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-229"><a id="__codelineno-0-229" name="__codelineno-0-229"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;kwarg provided but not attribute of dataset class: </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
+</span><span id="__span-0-230"><a id="__codelineno-0-230" name="__codelineno-0-230"></a>
+</span><span id="__span-0-231"><a id="__codelineno-0-231" name="__codelineno-0-231"></a>    <span class="k">def</span> <span class="nf">get_source_id</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-232"><a id="__codelineno-0-232" name="__codelineno-0-232"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">SOURCE_ID</span><span class="p">:</span>
+</span><span id="__span-0-233"><a id="__codelineno-0-233" name="__codelineno-0-233"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">SOURCE_ID</span>
+</span><span id="__span-0-234"><a id="__codelineno-0-234" name="__codelineno-0-234"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-235"><a id="__codelineno-0-235" name="__codelineno-0-235"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span>
+</span><span id="__span-0-236"><a id="__codelineno-0-236" name="__codelineno-0-236"></a>
+</span><span id="__span-0-237"><a id="__codelineno-0-237" name="__codelineno-0-237"></a>    <span class="k">def</span> <span class="nf">get_language_code</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">unknown</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;unknown&quot;</span><span class="p">,</span> <span class="n">mixed</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;mixed&quot;</span><span class="p">):</span>
+</span><span id="__span-0-238"><a id="__codelineno-0-238" name="__codelineno-0-238"></a>        <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">LANGUAGES</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
+</span><span id="__span-0-239"><a id="__codelineno-0-239" name="__codelineno-0-239"></a>            <span class="n">lang</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">LANGUAGES</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
+</span><span id="__span-0-240"><a id="__codelineno-0-240" name="__codelineno-0-240"></a>        <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">LANGUAGES</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-241"><a id="__codelineno-0-241" name="__codelineno-0-241"></a>            <span class="n">lang</span> <span class="o">=</span> <span class="n">unknown</span>
+</span><span id="__span-0-242"><a id="__codelineno-0-242" name="__codelineno-0-242"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-243"><a id="__codelineno-0-243" name="__codelineno-0-243"></a>            <span class="n">lang</span> <span class="o">=</span> <span class="n">mixed</span>
+</span><span id="__span-0-244"><a id="__codelineno-0-244" name="__codelineno-0-244"></a>
+</span><span id="__span-0-245"><a id="__codelineno-0-245" name="__codelineno-0-245"></a>        <span class="k">return</span> <span class="n">lang</span>
+</span><span id="__span-0-246"><a id="__codelineno-0-246" name="__codelineno-0-246"></a>
+</span><span id="__span-0-247"><a id="__codelineno-0-247" name="__codelineno-0-247"></a>    <span class="k">def</span> <span class="nf">get_output_text_field</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-248"><a id="__codelineno-0-248" name="__codelineno-0-248"></a>        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_text_field</span>
+</span><span id="__span-0-249"><a id="__codelineno-0-249" name="__codelineno-0-249"></a>
+</span><span id="__span-0-250"><a id="__codelineno-0-250" name="__codelineno-0-250"></a>    <span class="k">def</span> <span class="nf">has_output_files</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_file_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
+</span><span id="__span-0-251"><a id="__codelineno-0-251" name="__codelineno-0-251"></a>        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_single_output_file</span><span class="p">(</span>
+</span><span id="__span-0-252"><a id="__codelineno-0-252" name="__codelineno-0-252"></a>            <span class="n">min_file_size</span><span class="o">=</span><span class="n">min_file_size</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span>
+</span><span id="__span-0-253"><a id="__codelineno-0-253" name="__codelineno-0-253"></a>        <span class="p">)</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_chunked_output_files</span><span class="p">(</span><span class="n">min_file_size</span><span class="o">=</span><span class="n">min_file_size</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-254"><a id="__codelineno-0-254" name="__codelineno-0-254"></a>
+</span><span id="__span-0-255"><a id="__codelineno-0-255" name="__codelineno-0-255"></a>    <span class="k">def</span> <span class="nf">has_single_output_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_file_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
+</span><span id="__span-0-256"><a id="__codelineno-0-256" name="__codelineno-0-256"></a>        <span class="n">fp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_single_output_file_path</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-257"><a id="__codelineno-0-257" name="__codelineno-0-257"></a>
+</span><span id="__span-0-258"><a id="__codelineno-0-258" name="__codelineno-0-258"></a>        <span class="k">return</span> <span class="n">fp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">fp</span><span class="p">)</span> <span class="ow">and</span> <span class="n">os</span><span class="o">.</span><span class="n">stat</span><span class="p">(</span><span class="n">fp</span><span class="p">)</span><span class="o">.</span><span class="n">st_size</span> <span class="o">&gt;=</span> <span class="n">min_file_size</span>
+</span><span id="__span-0-259"><a id="__codelineno-0-259" name="__codelineno-0-259"></a>
+</span><span id="__span-0-260"><a id="__codelineno-0-260" name="__codelineno-0-260"></a>    <span class="k">def</span> <span class="nf">has_chunked_output_files</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_file_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
+</span><span id="__span-0-261"><a id="__codelineno-0-261" name="__codelineno-0-261"></a>        <span class="k">for</span> <span class="n">fp</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_chunked_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">):</span>
+</span><span id="__span-0-262"><a id="__codelineno-0-262" name="__codelineno-0-262"></a>            <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">fp</span><span class="p">)</span> <span class="ow">and</span> <span class="n">os</span><span class="o">.</span><span class="n">stat</span><span class="p">(</span><span class="n">fp</span><span class="p">)</span><span class="o">.</span><span class="n">st_size</span> <span class="o">&gt;=</span> <span class="n">min_file_size</span><span class="p">:</span>
+</span><span id="__span-0-263"><a id="__codelineno-0-263" name="__codelineno-0-263"></a>                <span class="k">return</span> <span class="kc">True</span>
+</span><span id="__span-0-264"><a id="__codelineno-0-264" name="__codelineno-0-264"></a>            <span class="k">break</span>
+</span><span id="__span-0-265"><a id="__codelineno-0-265" name="__codelineno-0-265"></a>
+</span><span id="__span-0-266"><a id="__codelineno-0-266" name="__codelineno-0-266"></a>        <span class="k">return</span> <span class="kc">False</span>
+</span><span id="__span-0-267"><a id="__codelineno-0-267" name="__codelineno-0-267"></a>
+</span><span id="__span-0-268"><a id="__codelineno-0-268" name="__codelineno-0-268"></a>    <span class="k">def</span> <span class="nf">get_output_file_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">single</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">chunked</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
+</span><span id="__span-0-269"><a id="__codelineno-0-269" name="__codelineno-0-269"></a>        <span class="k">if</span> <span class="n">single</span><span class="p">:</span>
+</span><span id="__span-0-270"><a id="__codelineno-0-270" name="__codelineno-0-270"></a>            <span class="k">return</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_single_output_file_path</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)]</span>
+</span><span id="__span-0-271"><a id="__codelineno-0-271" name="__codelineno-0-271"></a>        <span class="k">elif</span> <span class="n">chunked</span><span class="p">:</span>
+</span><span id="__span-0-272"><a id="__codelineno-0-272" name="__codelineno-0-272"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_chunked_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-273"><a id="__codelineno-0-273" name="__codelineno-0-273"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-274"><a id="__codelineno-0-274" name="__codelineno-0-274"></a>            <span class="c1"># auto determine based on existing files</span>
+</span><span id="__span-0-275"><a id="__codelineno-0-275" name="__codelineno-0-275"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_chunked_output_files</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">):</span>
+</span><span id="__span-0-276"><a id="__codelineno-0-276" name="__codelineno-0-276"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_chunked_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-277"><a id="__codelineno-0-277" name="__codelineno-0-277"></a>            <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-278"><a id="__codelineno-0-278" name="__codelineno-0-278"></a>                <span class="k">return</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_single_output_file_path</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)]</span>
+</span><span id="__span-0-279"><a id="__codelineno-0-279" name="__codelineno-0-279"></a>
+</span><span id="__span-0-280"><a id="__codelineno-0-280" name="__codelineno-0-280"></a>    <span class="k">def</span> <span class="nf">get_output_file_path</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-281"><a id="__codelineno-0-281" name="__codelineno-0-281"></a>        <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;Use `get_output_file_paths` instead!&quot;</span><span class="p">)</span>
+</span><span id="__span-0-282"><a id="__codelineno-0-282" name="__codelineno-0-282"></a>
+</span><span id="__span-0-283"><a id="__codelineno-0-283" name="__codelineno-0-283"></a>    <span class="k">def</span> <span class="nf">get_output_extension</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">with_dot</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-284"><a id="__codelineno-0-284" name="__codelineno-0-284"></a>        <span class="n">extension</span> <span class="o">=</span> <span class="s2">&quot;.&quot;</span> <span class="k">if</span> <span class="n">with_dot</span> <span class="k">else</span> <span class="s2">&quot;&quot;</span>
+</span><span id="__span-0-285"><a id="__codelineno-0-285" name="__codelineno-0-285"></a>
+</span><span id="__span-0-286"><a id="__codelineno-0-286" name="__codelineno-0-286"></a>        <span class="k">if</span> <span class="n">shuffled</span><span class="p">:</span>
+</span><span id="__span-0-287"><a id="__codelineno-0-287" name="__codelineno-0-287"></a>            <span class="n">extension</span> <span class="o">+=</span> <span class="s2">&quot;shuffled.&quot;</span>
+</span><span id="__span-0-288"><a id="__codelineno-0-288" name="__codelineno-0-288"></a>
+</span><span id="__span-0-289"><a id="__codelineno-0-289" name="__codelineno-0-289"></a>        <span class="n">extension</span> <span class="o">+=</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span>
+</span><span id="__span-0-290"><a id="__codelineno-0-290" name="__codelineno-0-290"></a>
+</span><span id="__span-0-291"><a id="__codelineno-0-291" name="__codelineno-0-291"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;jsonl&quot;</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_compression</span> <span class="o">==</span> <span class="s2">&quot;gzip&quot;</span><span class="p">:</span>
+</span><span id="__span-0-292"><a id="__codelineno-0-292" name="__codelineno-0-292"></a>            <span class="c1"># Simply add &quot;.gz&quot; as extension as smart_open will take care of the compression</span>
+</span><span id="__span-0-293"><a id="__codelineno-0-293" name="__codelineno-0-293"></a>            <span class="n">extension</span> <span class="o">+=</span> <span class="s2">&quot;.gz&quot;</span>
+</span><span id="__span-0-294"><a id="__codelineno-0-294" name="__codelineno-0-294"></a>
+</span><span id="__span-0-295"><a id="__codelineno-0-295" name="__codelineno-0-295"></a>        <span class="k">return</span> <span class="n">extension</span>
+</span><span id="__span-0-296"><a id="__codelineno-0-296" name="__codelineno-0-296"></a>
+</span><span id="__span-0-297"><a id="__codelineno-0-297" name="__codelineno-0-297"></a>    <span class="k">def</span> <span class="nf">get_output_dir</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-298"><a id="__codelineno-0-298" name="__codelineno-0-298"></a>        <span class="k">if</span> <span class="n">shuffled</span><span class="p">:</span>
+</span><span id="__span-0-299"><a id="__codelineno-0-299" name="__codelineno-0-299"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">shuffled_datasets_dir</span><span class="p">:</span>
+</span><span id="__span-0-300"><a id="__codelineno-0-300" name="__codelineno-0-300"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">shuffled_datasets_dir</span>
+</span><span id="__span-0-301"><a id="__codelineno-0-301" name="__codelineno-0-301"></a>            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;shuffled_datasets_dir is not set&quot;</span><span class="p">)</span>
+</span><span id="__span-0-302"><a id="__codelineno-0-302" name="__codelineno-0-302"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-303"><a id="__codelineno-0-303" name="__codelineno-0-303"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">text_datasets_dir</span>
+</span><span id="__span-0-304"><a id="__codelineno-0-304" name="__codelineno-0-304"></a>
+</span><span id="__span-0-305"><a id="__codelineno-0-305" name="__codelineno-0-305"></a>    <span class="k">def</span> <span class="nf">get_single_output_file_path</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-306"><a id="__codelineno-0-306" name="__codelineno-0-306"></a>        <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span>
+</span><span id="__span-0-307"><a id="__codelineno-0-307" name="__codelineno-0-307"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">get_output_dir</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_extension</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-308"><a id="__codelineno-0-308" name="__codelineno-0-308"></a>        <span class="p">)</span>
+</span><span id="__span-0-309"><a id="__codelineno-0-309" name="__codelineno-0-309"></a>
+</span><span id="__span-0-310"><a id="__codelineno-0-310" name="__codelineno-0-310"></a>    <span class="k">def</span> <span class="nf">get_chunked_output_file_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
+</span><span id="__span-0-311"><a id="__codelineno-0-311" name="__codelineno-0-311"></a>        <span class="n">output_dir_path</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_dir</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">))</span>
+</span><span id="__span-0-312"><a id="__codelineno-0-312" name="__codelineno-0-312"></a>
+</span><span id="__span-0-313"><a id="__codelineno-0-313" name="__codelineno-0-313"></a>        <span class="k">return</span> <span class="nb">list</span><span class="p">(</span>
+</span><span id="__span-0-314"><a id="__codelineno-0-314" name="__codelineno-0-314"></a>            <span class="n">output_dir_path</span><span class="o">.</span><span class="n">glob</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="si">}</span><span class="s2">.part-*-of-*</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_extension</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-315"><a id="__codelineno-0-315" name="__codelineno-0-315"></a>        <span class="p">)</span>
+</span><span id="__span-0-316"><a id="__codelineno-0-316" name="__codelineno-0-316"></a>
+</span><span id="__span-0-317"><a id="__codelineno-0-317" name="__codelineno-0-317"></a>    <span class="k">def</span> <span class="nf">get_chunked_output_file_path</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">part</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">total_parts</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-318"><a id="__codelineno-0-318" name="__codelineno-0-318"></a>        <span class="k">if</span> <span class="n">total_parts</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-319"><a id="__codelineno-0-319" name="__codelineno-0-319"></a>            <span class="n">fn</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="si">}</span><span class="s2">.part-</span><span class="si">{</span><span class="n">part</span><span class="si">:</span><span class="s2">04d</span><span class="si">}{</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_extension</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span><span class="si">}</span><span class="s2">&quot;</span>
+</span><span id="__span-0-320"><a id="__codelineno-0-320" name="__codelineno-0-320"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-321"><a id="__codelineno-0-321" name="__codelineno-0-321"></a>            <span class="n">fn</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="si">}</span><span class="s2">.part-</span><span class="si">{</span><span class="n">part</span><span class="si">:</span><span class="s2">04d</span><span class="si">}</span><span class="s2">-of-</span><span class="si">{</span><span class="n">total_parts</span><span class="si">:</span><span class="s2">04d</span><span class="si">}{</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_extension</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span><span class="si">}</span><span class="s2">&quot;</span>
+</span><span id="__span-0-322"><a id="__codelineno-0-322" name="__codelineno-0-322"></a>
+</span><span id="__span-0-323"><a id="__codelineno-0-323" name="__codelineno-0-323"></a>        <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_dir</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">),</span> <span class="n">fn</span><span class="p">)</span>
+</span><span id="__span-0-324"><a id="__codelineno-0-324" name="__codelineno-0-324"></a>
+</span><span id="__span-0-325"><a id="__codelineno-0-325" name="__codelineno-0-325"></a>    <span class="k">def</span> <span class="nf">get_single_or_chunked_output_file_path</span><span class="p">(</span>
+</span><span id="__span-0-326"><a id="__codelineno-0-326" name="__codelineno-0-326"></a>        <span class="bp">self</span><span class="p">,</span> <span class="n">part</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">total_parts</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span>
+</span><span id="__span-0-327"><a id="__codelineno-0-327" name="__codelineno-0-327"></a>    <span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-328"><a id="__codelineno-0-328" name="__codelineno-0-328"></a>        <span class="k">if</span> <span class="n">part</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-329"><a id="__codelineno-0-329" name="__codelineno-0-329"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_single_output_file_path</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-330"><a id="__codelineno-0-330" name="__codelineno-0-330"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-331"><a id="__codelineno-0-331" name="__codelineno-0-331"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_chunked_output_file_path</span><span class="p">(</span><span class="n">part</span><span class="p">,</span> <span class="n">total_parts</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">)</span>
+</span><span id="__span-0-332"><a id="__codelineno-0-332" name="__codelineno-0-332"></a>
+</span><span id="__span-0-333"><a id="__codelineno-0-333" name="__codelineno-0-333"></a>    <span class="k">def</span> <span class="nf">filter_texts_or_documents</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts_or_documents</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Document</span><span class="p">]]):</span>
+</span><span id="__span-0-334"><a id="__codelineno-0-334" name="__codelineno-0-334"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">use_documents</span><span class="p">:</span>
+</span><span id="__span-0-335"><a id="__codelineno-0-335" name="__codelineno-0-335"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_documents</span><span class="p">(</span><span class="n">texts_or_documents</span><span class="p">)</span>
+</span><span id="__span-0-336"><a id="__codelineno-0-336" name="__codelineno-0-336"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-337"><a id="__codelineno-0-337" name="__codelineno-0-337"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_texts</span><span class="p">(</span><span class="n">texts_or_documents</span><span class="p">)</span>
+</span><span id="__span-0-338"><a id="__codelineno-0-338" name="__codelineno-0-338"></a>
+</span><span id="__span-0-339"><a id="__codelineno-0-339" name="__codelineno-0-339"></a>    <span class="k">def</span> <span class="nf">filter_documents</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">documents</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Document</span><span class="p">]):</span>
+</span><span id="__span-0-340"><a id="__codelineno-0-340" name="__codelineno-0-340"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Applies basic filtering on the texts before saving&quot;&quot;&quot;</span>
+</span><span id="__span-0-341"><a id="__codelineno-0-341" name="__codelineno-0-341"></a>        <span class="k">for</span> <span class="n">doc</span> <span class="ow">in</span> <span class="n">documents</span><span class="p">:</span>
+</span><span id="__span-0-342"><a id="__codelineno-0-342" name="__codelineno-0-342"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">doc</span><span class="o">.</span><span class="n">text</span><span class="p">)</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span><span class="p">:</span>
+</span><span id="__span-0-343"><a id="__codelineno-0-343" name="__codelineno-0-343"></a>                <span class="c1"># skip because of short text length</span>
+</span><span id="__span-0-344"><a id="__codelineno-0-344" name="__codelineno-0-344"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;filtered_short_text&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">})</span>
+</span><span id="__span-0-345"><a id="__codelineno-0-345" name="__codelineno-0-345"></a>                <span class="k">continue</span>
+</span><span id="__span-0-346"><a id="__codelineno-0-346" name="__codelineno-0-346"></a>
+</span><span id="__span-0-347"><a id="__codelineno-0-347" name="__codelineno-0-347"></a>            <span class="k">yield</span> <span class="n">doc</span>
+</span><span id="__span-0-348"><a id="__codelineno-0-348" name="__codelineno-0-348"></a>
+</span><span id="__span-0-349"><a id="__codelineno-0-349" name="__codelineno-0-349"></a>    <span class="k">def</span> <span class="nf">filter_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">]):</span>
+</span><span id="__span-0-350"><a id="__codelineno-0-350" name="__codelineno-0-350"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Applies basic filtering on the texts before saving&quot;&quot;&quot;</span>
+</span><span id="__span-0-351"><a id="__codelineno-0-351" name="__codelineno-0-351"></a>        <span class="k">for</span> <span class="n">text</span> <span class="ow">in</span> <span class="n">texts</span><span class="p">:</span>
+</span><span id="__span-0-352"><a id="__codelineno-0-352" name="__codelineno-0-352"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">text</span><span class="p">)</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span><span class="p">:</span>
+</span><span id="__span-0-353"><a id="__codelineno-0-353" name="__codelineno-0-353"></a>                <span class="c1"># skip because of short text length</span>
+</span><span id="__span-0-354"><a id="__codelineno-0-354" name="__codelineno-0-354"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;filtered_short_text&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">})</span>
+</span><span id="__span-0-355"><a id="__codelineno-0-355" name="__codelineno-0-355"></a>                <span class="k">continue</span>
+</span><span id="__span-0-356"><a id="__codelineno-0-356" name="__codelineno-0-356"></a>
+</span><span id="__span-0-357"><a id="__codelineno-0-357" name="__codelineno-0-357"></a>            <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-358"><a id="__codelineno-0-358" name="__codelineno-0-358"></a>
+</span><span id="__span-0-359"><a id="__codelineno-0-359" name="__codelineno-0-359"></a>    <span class="k">def</span> <span class="nf">remove_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-360"><a id="__codelineno-0-360" name="__codelineno-0-360"></a>        <span class="k">for</span> <span class="n">fp</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">():</span>
+</span><span id="__span-0-361"><a id="__codelineno-0-361" name="__codelineno-0-361"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Removing </span><span class="si">{</span><span class="n">fp</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-362"><a id="__codelineno-0-362" name="__codelineno-0-362"></a>            <span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">fp</span><span class="p">)</span>
+</span><span id="__span-0-363"><a id="__codelineno-0-363" name="__codelineno-0-363"></a>
+</span><span id="__span-0-364"><a id="__codelineno-0-364" name="__codelineno-0-364"></a>    <span class="k">def</span> <span class="nf">save_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">append</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-365"><a id="__codelineno-0-365" name="__codelineno-0-365"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Save texts in different formats&quot;&quot;&quot;</span>
+</span><span id="__span-0-366"><a id="__codelineno-0-366" name="__codelineno-0-366"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_output_files</span><span class="p">()</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">override_output</span><span class="p">:</span>
+</span><span id="__span-0-367"><a id="__codelineno-0-367" name="__codelineno-0-367"></a>            <span class="k">raise</span> <span class="ne">FileExistsError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Output exists already (override not enabled): </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">()</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-368"><a id="__codelineno-0-368" name="__codelineno-0-368"></a>
+</span><span id="__span-0-369"><a id="__codelineno-0-369" name="__codelineno-0-369"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;jsonl&quot;</span><span class="p">:</span>
+</span><span id="__span-0-370"><a id="__codelineno-0-370" name="__codelineno-0-370"></a>            <span class="n">docs_count</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_texts_to_jsonl</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="n">append</span><span class="o">=</span><span class="n">append</span><span class="p">)</span>
+</span><span id="__span-0-371"><a id="__codelineno-0-371" name="__codelineno-0-371"></a>
+</span><span id="__span-0-372"><a id="__codelineno-0-372" name="__codelineno-0-372"></a>        <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-373"><a id="__codelineno-0-373" name="__codelineno-0-373"></a>            <span class="k">if</span> <span class="n">append</span><span class="p">:</span>
+</span><span id="__span-0-374"><a id="__codelineno-0-374" name="__codelineno-0-374"></a>                <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;Appending is not supported by parquet output format&quot;</span><span class="p">)</span>
+</span><span id="__span-0-375"><a id="__codelineno-0-375" name="__codelineno-0-375"></a>
+</span><span id="__span-0-376"><a id="__codelineno-0-376" name="__codelineno-0-376"></a>            <span class="n">docs_count</span><span class="p">,</span> <span class="n">saved_chunks</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_texts_to_parquet</span><span class="p">(</span><span class="n">texts</span><span class="p">)</span>
+</span><span id="__span-0-377"><a id="__codelineno-0-377" name="__codelineno-0-377"></a>
+</span><span id="__span-0-378"><a id="__codelineno-0-378" name="__codelineno-0-378"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;saved_chunks&quot;</span><span class="p">:</span> <span class="n">saved_chunks</span><span class="p">})</span>
+</span><span id="__span-0-379"><a id="__codelineno-0-379" name="__codelineno-0-379"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-380"><a id="__codelineno-0-380" name="__codelineno-0-380"></a>            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Unsupported output format: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_format</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-381"><a id="__codelineno-0-381" name="__codelineno-0-381"></a>
+</span><span id="__span-0-382"><a id="__codelineno-0-382" name="__codelineno-0-382"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Documents saved: </span><span class="si">{</span><span class="n">docs_count</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-383"><a id="__codelineno-0-383" name="__codelineno-0-383"></a>
+</span><span id="__span-0-384"><a id="__codelineno-0-384" name="__codelineno-0-384"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;docs_count&quot;</span><span class="p">:</span> <span class="n">docs_count</span><span class="p">})</span>
+</span><span id="__span-0-385"><a id="__codelineno-0-385" name="__codelineno-0-385"></a>
+</span><span id="__span-0-386"><a id="__codelineno-0-386" name="__codelineno-0-386"></a>        <span class="k">if</span> <span class="n">docs_count</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-387"><a id="__codelineno-0-387" name="__codelineno-0-387"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;No documents have been saved!&quot;</span><span class="p">)</span>
+</span><span id="__span-0-388"><a id="__codelineno-0-388" name="__codelineno-0-388"></a>
+</span><span id="__span-0-389"><a id="__codelineno-0-389" name="__codelineno-0-389"></a>            <span class="c1"># delete empty output file</span>
+</span><span id="__span-0-390"><a id="__codelineno-0-390" name="__codelineno-0-390"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_output_files</span><span class="p">():</span>
+</span><span id="__span-0-391"><a id="__codelineno-0-391" name="__codelineno-0-391"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">remove_texts</span><span class="p">()</span>
+</span><span id="__span-0-392"><a id="__codelineno-0-392" name="__codelineno-0-392"></a>
+</span><span id="__span-0-393"><a id="__codelineno-0-393" name="__codelineno-0-393"></a>        <span class="k">return</span> <span class="n">docs_count</span>
+</span><span id="__span-0-394"><a id="__codelineno-0-394" name="__codelineno-0-394"></a>
+</span><span id="__span-0-395"><a id="__codelineno-0-395" name="__codelineno-0-395"></a>    <span class="k">def</span> <span class="nf">save_texts_to_parquet</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">file_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">apply_filter</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span>
+</span><span id="__span-0-396"><a id="__codelineno-0-396" name="__codelineno-0-396"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Save text in parquet (single column schema, in batches)&quot;&quot;&quot;</span>
+</span><span id="__span-0-397"><a id="__codelineno-0-397" name="__codelineno-0-397"></a>        <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span>
+</span><span id="__span-0-398"><a id="__codelineno-0-398" name="__codelineno-0-398"></a>
+</span><span id="__span-0-399"><a id="__codelineno-0-399" name="__codelineno-0-399"></a>        <span class="k">if</span> <span class="n">file_path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-400"><a id="__codelineno-0-400" name="__codelineno-0-400"></a>            <span class="n">file_path</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="kc">True</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+</span><span id="__span-0-401"><a id="__codelineno-0-401" name="__codelineno-0-401"></a>
+</span><span id="__span-0-402"><a id="__codelineno-0-402" name="__codelineno-0-402"></a>        <span class="k">if</span> <span class="n">apply_filter</span><span class="p">:</span>
+</span><span id="__span-0-403"><a id="__codelineno-0-403" name="__codelineno-0-403"></a>            <span class="n">texts</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_texts_or_documents</span><span class="p">(</span><span class="n">texts</span><span class="p">)</span>
+</span><span id="__span-0-404"><a id="__codelineno-0-404" name="__codelineno-0-404"></a>
+</span><span id="__span-0-405"><a id="__codelineno-0-405" name="__codelineno-0-405"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">use_documents</span><span class="p">:</span>
+</span><span id="__span-0-406"><a id="__codelineno-0-406" name="__codelineno-0-406"></a>            <span class="c1"># document schema</span>
+</span><span id="__span-0-407"><a id="__codelineno-0-407" name="__codelineno-0-407"></a>            <span class="n">schema</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_document_schema</span><span class="p">()</span><span class="o">.</span><span class="n">get_pa_schema</span><span class="p">()</span>
+</span><span id="__span-0-408"><a id="__codelineno-0-408" name="__codelineno-0-408"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-409"><a id="__codelineno-0-409" name="__codelineno-0-409"></a>            <span class="c1"># text-only schema</span>
+</span><span id="__span-0-410"><a id="__codelineno-0-410" name="__codelineno-0-410"></a>            <span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span>
+</span><span id="__span-0-411"><a id="__codelineno-0-411" name="__codelineno-0-411"></a>                <span class="p">[</span>
+</span><span id="__span-0-412"><a id="__codelineno-0-412" name="__codelineno-0-412"></a>                    <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">(),</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
+</span><span id="__span-0-413"><a id="__codelineno-0-413" name="__codelineno-0-413"></a>                <span class="p">]</span>
+</span><span id="__span-0-414"><a id="__codelineno-0-414" name="__codelineno-0-414"></a>            <span class="p">)</span>
+</span><span id="__span-0-415"><a id="__codelineno-0-415" name="__codelineno-0-415"></a>
+</span><span id="__span-0-416"><a id="__codelineno-0-416" name="__codelineno-0-416"></a>        <span class="c1"># Max. chunk size is multiplied with this factor</span>
+</span><span id="__span-0-417"><a id="__codelineno-0-417" name="__codelineno-0-417"></a>        <span class="c1"># (to account for inaccurate chunk sizes due to batching)</span>
+</span><span id="__span-0-418"><a id="__codelineno-0-418" name="__codelineno-0-418"></a>        <span class="n">safety_factor</span> <span class="o">=</span> <span class="mf">0.975</span>
+</span><span id="__span-0-419"><a id="__codelineno-0-419" name="__codelineno-0-419"></a>
+</span><span id="__span-0-420"><a id="__codelineno-0-420" name="__codelineno-0-420"></a>        <span class="c1"># Save as Parquet file</span>
+</span><span id="__span-0-421"><a id="__codelineno-0-421" name="__codelineno-0-421"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Writing parquet output (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span><span class="si">=}</span><span class="s2">; </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="si">=}</span><span class="s2">; </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_compression</span><span class="si">=}</span><span class="s2">)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-422"><a id="__codelineno-0-422" name="__codelineno-0-422"></a>
+</span><span id="__span-0-423"><a id="__codelineno-0-423" name="__codelineno-0-423"></a>        <span class="n">saved_docs</span><span class="p">,</span> <span class="n">saved_chunks</span> <span class="o">=</span> <span class="n">save_texts_to_parquet_chunks</span><span class="p">(</span>
+</span><span id="__span-0-424"><a id="__codelineno-0-424" name="__codelineno-0-424"></a>            <span class="n">texts</span><span class="o">=</span><span class="n">texts</span><span class="p">,</span>
+</span><span id="__span-0-425"><a id="__codelineno-0-425" name="__codelineno-0-425"></a>            <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span>
+</span><span id="__span-0-426"><a id="__codelineno-0-426" name="__codelineno-0-426"></a>            <span class="n">max_chunk_uncompressed_bytes</span><span class="o">=</span><span class="p">(</span>
+</span><span id="__span-0-427"><a id="__codelineno-0-427" name="__codelineno-0-427"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_uncompressed_bytes</span> <span class="o">*</span> <span class="n">safety_factor</span>
+</span><span id="__span-0-428"><a id="__codelineno-0-428" name="__codelineno-0-428"></a>                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_uncompressed_bytes</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
+</span><span id="__span-0-429"><a id="__codelineno-0-429" name="__codelineno-0-429"></a>                <span class="k">else</span> <span class="kc">None</span>
+</span><span id="__span-0-430"><a id="__codelineno-0-430" name="__codelineno-0-430"></a>            <span class="p">),</span>
+</span><span id="__span-0-431"><a id="__codelineno-0-431" name="__codelineno-0-431"></a>            <span class="n">max_chunk_rows</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_rows</span><span class="p">,</span>
+</span><span id="__span-0-432"><a id="__codelineno-0-432" name="__codelineno-0-432"></a>            <span class="n">output_path_func</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">get_single_or_chunked_output_file_path</span><span class="p">,</span>
+</span><span id="__span-0-433"><a id="__codelineno-0-433" name="__codelineno-0-433"></a>            <span class="n">compression</span><span class="o">=</span><span class="n">get_parquet_compression</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">output_compression</span><span class="p">),</span>
+</span><span id="__span-0-434"><a id="__codelineno-0-434" name="__codelineno-0-434"></a>            <span class="n">batch_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span><span class="p">,</span>
+</span><span id="__span-0-435"><a id="__codelineno-0-435" name="__codelineno-0-435"></a>            <span class="n">print_write_progress</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">print_write_progress</span><span class="p">,</span>
+</span><span id="__span-0-436"><a id="__codelineno-0-436" name="__codelineno-0-436"></a>            <span class="n">limit</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="p">,</span>
+</span><span id="__span-0-437"><a id="__codelineno-0-437" name="__codelineno-0-437"></a>        <span class="p">)</span>
+</span><span id="__span-0-438"><a id="__codelineno-0-438" name="__codelineno-0-438"></a>
+</span><span id="__span-0-439"><a id="__codelineno-0-439" name="__codelineno-0-439"></a>        <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="s2">&quot;terminate&quot;</span><span class="p">):</span>
+</span><span id="__span-0-440"><a id="__codelineno-0-440" name="__codelineno-0-440"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Killing all remaining workers, if any (iterator end)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-441"><a id="__codelineno-0-441" name="__codelineno-0-441"></a>            <span class="n">texts</span><span class="o">.</span><span class="n">terminate</span><span class="p">()</span>
+</span><span id="__span-0-442"><a id="__codelineno-0-442" name="__codelineno-0-442"></a>
+</span><span id="__span-0-443"><a id="__codelineno-0-443" name="__codelineno-0-443"></a>        <span class="k">return</span> <span class="n">saved_docs</span><span class="p">,</span> <span class="n">saved_chunks</span>
+</span><span id="__span-0-444"><a id="__codelineno-0-444" name="__codelineno-0-444"></a>
+</span><span id="__span-0-445"><a id="__codelineno-0-445" name="__codelineno-0-445"></a>    <span class="k">def</span> <span class="nf">save_texts_to_jsonl</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">append</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-446"><a id="__codelineno-0-446" name="__codelineno-0-446"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Write JSONL files to &lt;output_dir&gt;/&lt;DATASET_ID&gt;.jsonl</span>
+</span><span id="__span-0-447"><a id="__codelineno-0-447" name="__codelineno-0-447"></a><span class="sd">        (each line is a JSON object with &quot;doc&quot; field and text as plain text)</span>
+</span><span id="__span-0-448"><a id="__codelineno-0-448" name="__codelineno-0-448"></a><span class="sd">        &quot;&quot;&quot;</span>
+</span><span id="__span-0-449"><a id="__codelineno-0-449" name="__codelineno-0-449"></a>        <span class="n">mode</span> <span class="o">=</span> <span class="s2">&quot;a&quot;</span> <span class="k">if</span> <span class="n">append</span> <span class="k">else</span> <span class="s2">&quot;w&quot;</span>
+</span><span id="__span-0-450"><a id="__codelineno-0-450" name="__codelineno-0-450"></a>        <span class="n">fp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="kc">True</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+</span><span id="__span-0-451"><a id="__codelineno-0-451" name="__codelineno-0-451"></a>
+</span><span id="__span-0-452"><a id="__codelineno-0-452" name="__codelineno-0-452"></a>        <span class="c1"># Save as JSONL</span>
+</span><span id="__span-0-453"><a id="__codelineno-0-453" name="__codelineno-0-453"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Writing JSONL output to </span><span class="si">{</span><span class="n">fp</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">mode</span><span class="si">=}</span><span class="s2">)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-454"><a id="__codelineno-0-454" name="__codelineno-0-454"></a>
+</span><span id="__span-0-455"><a id="__codelineno-0-455" name="__codelineno-0-455"></a>        <span class="n">docs_count</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-456"><a id="__codelineno-0-456" name="__codelineno-0-456"></a>
+</span><span id="__span-0-457"><a id="__codelineno-0-457" name="__codelineno-0-457"></a>        <span class="k">with</span> <span class="n">smart_open</span><span class="p">(</span><span class="n">fp</span><span class="p">,</span> <span class="n">mode</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-458"><a id="__codelineno-0-458" name="__codelineno-0-458"></a>            <span class="k">for</span> <span class="n">docs_count</span><span class="p">,</span> <span class="n">text</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">filter_texts</span><span class="p">(</span><span class="n">texts</span><span class="p">),</span> <span class="mi">1</span><span class="p">):</span>
+</span><span id="__span-0-459"><a id="__codelineno-0-459" name="__codelineno-0-459"></a>                <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">({</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">():</span> <span class="n">text</span><span class="p">},</span> <span class="n">ensure_ascii</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">json_ensure_ascii</span><span class="p">)</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-460"><a id="__codelineno-0-460" name="__codelineno-0-460"></a>
+</span><span id="__span-0-461"><a id="__codelineno-0-461" name="__codelineno-0-461"></a>                <span class="k">if</span> <span class="n">docs_count</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="p">(</span><span class="n">docs_count</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="n">print_write_progress</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-462"><a id="__codelineno-0-462" name="__codelineno-0-462"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Written </span><span class="si">{</span><span class="n">docs_count</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2"> docs ...&quot;</span><span class="p">)</span>
+</span><span id="__span-0-463"><a id="__codelineno-0-463" name="__codelineno-0-463"></a>
+</span><span id="__span-0-464"><a id="__codelineno-0-464" name="__codelineno-0-464"></a>                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">docs_count</span> <span class="o">&gt;=</span> <span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="p">:</span>
+</span><span id="__span-0-465"><a id="__codelineno-0-465" name="__codelineno-0-465"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Limit reached (</span><span class="si">{</span><span class="n">docs_count</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2"> docs)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-466"><a id="__codelineno-0-466" name="__codelineno-0-466"></a>
+</span><span id="__span-0-467"><a id="__codelineno-0-467" name="__codelineno-0-467"></a>                    <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="s2">&quot;terminate&quot;</span><span class="p">):</span>
+</span><span id="__span-0-468"><a id="__codelineno-0-468" name="__codelineno-0-468"></a>                        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Killing all remaining workers, if any&quot;</span><span class="p">)</span>
+</span><span id="__span-0-469"><a id="__codelineno-0-469" name="__codelineno-0-469"></a>                        <span class="n">texts</span><span class="o">.</span><span class="n">terminate</span><span class="p">()</span>
+</span><span id="__span-0-470"><a id="__codelineno-0-470" name="__codelineno-0-470"></a>                    <span class="k">break</span>
+</span><span id="__span-0-471"><a id="__codelineno-0-471" name="__codelineno-0-471"></a>
+</span><span id="__span-0-472"><a id="__codelineno-0-472" name="__codelineno-0-472"></a>        <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="s2">&quot;terminate&quot;</span><span class="p">):</span>
+</span><span id="__span-0-473"><a id="__codelineno-0-473" name="__codelineno-0-473"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Killing all remaining workers, if any (iterator end)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-474"><a id="__codelineno-0-474" name="__codelineno-0-474"></a>            <span class="n">texts</span><span class="o">.</span><span class="n">terminate</span><span class="p">()</span>
+</span><span id="__span-0-475"><a id="__codelineno-0-475" name="__codelineno-0-475"></a>
+</span><span id="__span-0-476"><a id="__codelineno-0-476" name="__codelineno-0-476"></a>        <span class="k">return</span> <span class="n">docs_count</span>
+</span><span id="__span-0-477"><a id="__codelineno-0-477" name="__codelineno-0-477"></a>
+</span><span id="__span-0-478"><a id="__codelineno-0-478" name="__codelineno-0-478"></a>    <span class="k">def</span> <span class="nf">get_hf_auth_token</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-479"><a id="__codelineno-0-479" name="__codelineno-0-479"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">hf_auth_token</span><span class="p">:</span>
+</span><span id="__span-0-480"><a id="__codelineno-0-480" name="__codelineno-0-480"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">hf_auth_token</span>
+</span><span id="__span-0-481"><a id="__codelineno-0-481" name="__codelineno-0-481"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-482"><a id="__codelineno-0-482" name="__codelineno-0-482"></a>            <span class="n">env_token</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;HF_PASSWORD&quot;</span><span class="p">)</span>
+</span><span id="__span-0-483"><a id="__codelineno-0-483" name="__codelineno-0-483"></a>
+</span><span id="__span-0-484"><a id="__codelineno-0-484" name="__codelineno-0-484"></a>            <span class="k">if</span> <span class="n">env_token</span><span class="p">:</span>
+</span><span id="__span-0-485"><a id="__codelineno-0-485" name="__codelineno-0-485"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Using HF auth token from env var&quot;</span><span class="p">)</span>
+</span><span id="__span-0-486"><a id="__codelineno-0-486" name="__codelineno-0-486"></a>                <span class="k">return</span> <span class="n">env_token</span>
+</span><span id="__span-0-487"><a id="__codelineno-0-487" name="__codelineno-0-487"></a>
+</span><span id="__span-0-488"><a id="__codelineno-0-488" name="__codelineno-0-488"></a>        <span class="k">return</span> <span class="kc">None</span>
+</span><span id="__span-0-489"><a id="__codelineno-0-489" name="__codelineno-0-489"></a>
+</span><span id="__span-0-490"><a id="__codelineno-0-490" name="__codelineno-0-490"></a>    <span class="k">def</span> <span class="nf">get_local_dataset_dir</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-491"><a id="__codelineno-0-491" name="__codelineno-0-491"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">:</span>
+</span><span id="__span-0-492"><a id="__codelineno-0-492" name="__codelineno-0-492"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">local_dirs_by_dataset_id</span><span class="p">:</span>
+</span><span id="__span-0-493"><a id="__codelineno-0-493" name="__codelineno-0-493"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">local_dirs_by_dataset_id</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">]</span>
+</span><span id="__span-0-494"><a id="__codelineno-0-494" name="__codelineno-0-494"></a>
+</span><span id="__span-0-495"><a id="__codelineno-0-495" name="__codelineno-0-495"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">local_dirs_by_source_id</span><span class="p">:</span>
+</span><span id="__span-0-496"><a id="__codelineno-0-496" name="__codelineno-0-496"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">local_dirs_by_source_id</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()]</span>
+</span><span id="__span-0-497"><a id="__codelineno-0-497" name="__codelineno-0-497"></a>
+</span><span id="__span-0-498"><a id="__codelineno-0-498" name="__codelineno-0-498"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">LOCAL_DIRS</span><span class="p">:</span>  <span class="c1"># TODO deprecated -&gt; use config instead!</span>
+</span><span id="__span-0-499"><a id="__codelineno-0-499" name="__codelineno-0-499"></a>            <span class="c1"># manually defined dataset directory</span>
+</span><span id="__span-0-500"><a id="__codelineno-0-500" name="__codelineno-0-500"></a>            <span class="k">return</span> <span class="n">get_path_by_system</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">LOCAL_DIRS</span><span class="p">)</span>
+</span><span id="__span-0-501"><a id="__codelineno-0-501" name="__codelineno-0-501"></a>        <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">raw_datasets_dir</span><span class="p">:</span>
+</span><span id="__span-0-502"><a id="__codelineno-0-502" name="__codelineno-0-502"></a>            <span class="c1"># automatically based on language + dataset_id</span>
+</span><span id="__span-0-503"><a id="__codelineno-0-503" name="__codelineno-0-503"></a>            <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">raw_datasets_dir</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_language_code</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">)</span>
+</span><span id="__span-0-504"><a id="__codelineno-0-504" name="__codelineno-0-504"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-505"><a id="__codelineno-0-505" name="__codelineno-0-505"></a>            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Either `LOCAL_DIRS` or `raw_datasets_dir` must be defined.&quot;</span><span class="p">)</span>
+</span><span id="__span-0-506"><a id="__codelineno-0-506" name="__codelineno-0-506"></a>
+</span><span id="__span-0-507"><a id="__codelineno-0-507" name="__codelineno-0-507"></a>    <span class="k">def</span> <span class="nf">get_dataset_file_paths</span><span class="p">(</span>
+</span><span id="__span-0-508"><a id="__codelineno-0-508" name="__codelineno-0-508"></a>        <span class="bp">self</span><span class="p">,</span>
+</span><span id="__span-0-509"><a id="__codelineno-0-509" name="__codelineno-0-509"></a>        <span class="n">dataset_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-510"><a id="__codelineno-0-510" name="__codelineno-0-510"></a>        <span class="n">single_file</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-511"><a id="__codelineno-0-511" name="__codelineno-0-511"></a>        <span class="n">subdirectories</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-512"><a id="__codelineno-0-512" name="__codelineno-0-512"></a>        <span class="n">needed_suffix</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-513"><a id="__codelineno-0-513" name="__codelineno-0-513"></a>        <span class="n">return_none_if_not_dir_exists</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-514"><a id="__codelineno-0-514" name="__codelineno-0-514"></a>    <span class="p">):</span>
+</span><span id="__span-0-515"><a id="__codelineno-0-515" name="__codelineno-0-515"></a>        <span class="k">if</span> <span class="n">dataset_dir</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-516"><a id="__codelineno-0-516" name="__codelineno-0-516"></a>            <span class="n">dataset_dir</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_local_dataset_dir</span><span class="p">()</span>
+</span><span id="__span-0-517"><a id="__codelineno-0-517" name="__codelineno-0-517"></a>
+</span><span id="__span-0-518"><a id="__codelineno-0-518" name="__codelineno-0-518"></a>        <span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">dataset_dir</span><span class="p">):</span>
+</span><span id="__span-0-519"><a id="__codelineno-0-519" name="__codelineno-0-519"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Download directory does not exist: </span><span class="si">{</span><span class="n">dataset_dir</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-520"><a id="__codelineno-0-520" name="__codelineno-0-520"></a>
+</span><span id="__span-0-521"><a id="__codelineno-0-521" name="__codelineno-0-521"></a>            <span class="k">if</span> <span class="n">return_none_if_not_dir_exists</span><span class="p">:</span>
+</span><span id="__span-0-522"><a id="__codelineno-0-522" name="__codelineno-0-522"></a>                <span class="k">return</span> <span class="kc">None</span>
+</span><span id="__span-0-523"><a id="__codelineno-0-523" name="__codelineno-0-523"></a>            <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-524"><a id="__codelineno-0-524" name="__codelineno-0-524"></a>                <span class="k">return</span> <span class="p">[]</span>
+</span><span id="__span-0-525"><a id="__codelineno-0-525" name="__codelineno-0-525"></a>
+</span><span id="__span-0-526"><a id="__codelineno-0-526" name="__codelineno-0-526"></a>        <span class="k">if</span> <span class="n">subdirectories</span><span class="p">:</span>
+</span><span id="__span-0-527"><a id="__codelineno-0-527" name="__codelineno-0-527"></a>            <span class="c1"># find files in all subdirectories</span>
+</span><span id="__span-0-528"><a id="__codelineno-0-528" name="__codelineno-0-528"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Finding dataset files in all subdirectories: </span><span class="si">{</span><span class="n">dataset_dir</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-529"><a id="__codelineno-0-529" name="__codelineno-0-529"></a>            <span class="n">fps</span> <span class="o">=</span> <span class="p">[</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">path</span><span class="p">,</span> <span class="n">subdirs</span><span class="p">,</span> <span class="n">files</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">walk</span><span class="p">(</span><span class="n">dataset_dir</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">files</span><span class="p">]</span>
+</span><span id="__span-0-530"><a id="__codelineno-0-530" name="__codelineno-0-530"></a>
+</span><span id="__span-0-531"><a id="__codelineno-0-531" name="__codelineno-0-531"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-532"><a id="__codelineno-0-532" name="__codelineno-0-532"></a>            <span class="c1"># root-level files</span>
+</span><span id="__span-0-533"><a id="__codelineno-0-533" name="__codelineno-0-533"></a>            <span class="n">fps</span> <span class="o">=</span> <span class="p">[</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">dataset_dir</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">listdir</span><span class="p">(</span><span class="n">dataset_dir</span><span class="p">)]</span>
+</span><span id="__span-0-534"><a id="__codelineno-0-534" name="__codelineno-0-534"></a>
+</span><span id="__span-0-535"><a id="__codelineno-0-535" name="__codelineno-0-535"></a>        <span class="c1"># filter by suffix</span>
+</span><span id="__span-0-536"><a id="__codelineno-0-536" name="__codelineno-0-536"></a>        <span class="n">fps</span> <span class="o">=</span> <span class="p">[</span><span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">fps</span> <span class="k">if</span> <span class="n">needed_suffix</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">f</span><span class="o">.</span><span class="n">endswith</span><span class="p">(</span><span class="n">needed_suffix</span><span class="p">)]</span>
+</span><span id="__span-0-537"><a id="__codelineno-0-537" name="__codelineno-0-537"></a>
+</span><span id="__span-0-538"><a id="__codelineno-0-538" name="__codelineno-0-538"></a>        <span class="c1"># filter by file type</span>
+</span><span id="__span-0-539"><a id="__codelineno-0-539" name="__codelineno-0-539"></a>        <span class="n">fps</span> <span class="o">=</span> <span class="p">[</span><span class="n">fp</span> <span class="k">for</span> <span class="n">fp</span> <span class="ow">in</span> <span class="n">fps</span> <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">isfile</span><span class="p">(</span><span class="n">fp</span><span class="p">)]</span>
+</span><span id="__span-0-540"><a id="__codelineno-0-540" name="__codelineno-0-540"></a>
+</span><span id="__span-0-541"><a id="__codelineno-0-541" name="__codelineno-0-541"></a>        <span class="k">if</span> <span class="n">single_file</span><span class="p">:</span>
+</span><span id="__span-0-542"><a id="__codelineno-0-542" name="__codelineno-0-542"></a>            <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">fps</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
+</span><span id="__span-0-543"><a id="__codelineno-0-543" name="__codelineno-0-543"></a>                <span class="k">raise</span> <span class="ne">FileExistsError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Multiple files in download directory but only a single one was expected: </span><span class="si">{</span><span class="n">fps</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-544"><a id="__codelineno-0-544" name="__codelineno-0-544"></a>            <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">fps</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-545"><a id="__codelineno-0-545" name="__codelineno-0-545"></a>                <span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;No file found but a single one was expected: </span><span class="si">{</span><span class="n">fps</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-546"><a id="__codelineno-0-546" name="__codelineno-0-546"></a>
+</span><span id="__span-0-547"><a id="__codelineno-0-547" name="__codelineno-0-547"></a>            <span class="k">return</span> <span class="n">fps</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
+</span><span id="__span-0-548"><a id="__codelineno-0-548" name="__codelineno-0-548"></a>
+</span><span id="__span-0-549"><a id="__codelineno-0-549" name="__codelineno-0-549"></a>        <span class="k">return</span> <span class="n">fps</span>
+</span><span id="__span-0-550"><a id="__codelineno-0-550" name="__codelineno-0-550"></a>
+</span><span id="__span-0-551"><a id="__codelineno-0-551" name="__codelineno-0-551"></a>    <span class="k">def</span> <span class="nf">decompress</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-552"><a id="__codelineno-0-552" name="__codelineno-0-552"></a>        <span class="k">raise</span> <span class="ne">NotImplementedError</span>
+</span><span id="__span-0-553"><a id="__codelineno-0-553" name="__codelineno-0-553"></a>
+</span><span id="__span-0-554"><a id="__codelineno-0-554" name="__codelineno-0-554"></a>    <span class="k">def</span> <span class="nf">is_dummy</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-555"><a id="__codelineno-0-555" name="__codelineno-0-555"></a>        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">DUMMY</span>
+</span><span id="__span-0-556"><a id="__codelineno-0-556" name="__codelineno-0-556"></a>
+</span><span id="__span-0-557"><a id="__codelineno-0-557" name="__codelineno-0-557"></a>    <span class="k">def</span> <span class="nf">is_downloaded</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-558"><a id="__codelineno-0-558" name="__codelineno-0-558"></a>        <span class="k">return</span> <span class="kc">False</span>
+</span><span id="__span-0-559"><a id="__codelineno-0-559" name="__codelineno-0-559"></a>
+</span><span id="__span-0-560"><a id="__codelineno-0-560" name="__codelineno-0-560"></a>    <span class="k">def</span> <span class="nf">download</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-561"><a id="__codelineno-0-561" name="__codelineno-0-561"></a>        <span class="c1"># Download all DOWNLOAD_URLS into local dataset dir</span>
+</span><span id="__span-0-562"><a id="__codelineno-0-562" name="__codelineno-0-562"></a>        <span class="n">output_dir</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_local_dataset_dir</span><span class="p">()</span>
+</span><span id="__span-0-563"><a id="__codelineno-0-563" name="__codelineno-0-563"></a>
+</span><span id="__span-0-564"><a id="__codelineno-0-564" name="__codelineno-0-564"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Downloading </span><span class="si">{</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">DOWNLOAD_URLS</span><span class="p">)</span><span class="si">}</span><span class="s2"> files to </span><span class="si">{</span><span class="n">output_dir</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-565"><a id="__codelineno-0-565" name="__codelineno-0-565"></a>
+</span><span id="__span-0-566"><a id="__codelineno-0-566" name="__codelineno-0-566"></a>        <span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_dir</span><span class="p">):</span>
+</span><span id="__span-0-567"><a id="__codelineno-0-567" name="__codelineno-0-567"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Creating download dir: </span><span class="si">{</span><span class="n">output_dir</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-568"><a id="__codelineno-0-568" name="__codelineno-0-568"></a>            <span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">output_dir</span><span class="p">)</span>
+</span><span id="__span-0-569"><a id="__codelineno-0-569" name="__codelineno-0-569"></a>
+</span><span id="__span-0-570"><a id="__codelineno-0-570" name="__codelineno-0-570"></a>        <span class="k">for</span> <span class="n">source_url</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">DOWNLOAD_URLS</span><span class="p">:</span>
+</span><span id="__span-0-571"><a id="__codelineno-0-571" name="__codelineno-0-571"></a>            <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">source_url</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
+</span><span id="__span-0-572"><a id="__codelineno-0-572" name="__codelineno-0-572"></a>                <span class="n">source_url</span><span class="p">,</span> <span class="n">target_filename</span> <span class="o">=</span> <span class="n">source_url</span>
+</span><span id="__span-0-573"><a id="__codelineno-0-573" name="__codelineno-0-573"></a>                <span class="n">output_filepath</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">output_dir</span><span class="p">,</span> <span class="n">target_filename</span><span class="p">)</span>
+</span><span id="__span-0-574"><a id="__codelineno-0-574" name="__codelineno-0-574"></a>
+</span><span id="__span-0-575"><a id="__codelineno-0-575" name="__codelineno-0-575"></a>                <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_filepath</span><span class="p">):</span>
+</span><span id="__span-0-576"><a id="__codelineno-0-576" name="__codelineno-0-576"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Output exists already: </span><span class="si">{</span><span class="n">output_filepath</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-577"><a id="__codelineno-0-577" name="__codelineno-0-577"></a>                    <span class="k">continue</span>
+</span><span id="__span-0-578"><a id="__codelineno-0-578" name="__codelineno-0-578"></a>            <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-579"><a id="__codelineno-0-579" name="__codelineno-0-579"></a>                <span class="n">output_filepath</span> <span class="o">=</span> <span class="n">output_dir</span>  <span class="c1"># auto file name</span>
+</span><span id="__span-0-580"><a id="__codelineno-0-580" name="__codelineno-0-580"></a>
+</span><span id="__span-0-581"><a id="__codelineno-0-581" name="__codelineno-0-581"></a>            <span class="k">try</span><span class="p">:</span>
+</span><span id="__span-0-582"><a id="__codelineno-0-582" name="__codelineno-0-582"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Download URL: </span><span class="si">{</span><span class="n">source_url</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-583"><a id="__codelineno-0-583" name="__codelineno-0-583"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Output path: </span><span class="si">{</span><span class="n">output_filepath</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-584"><a id="__codelineno-0-584" name="__codelineno-0-584"></a>
+</span><span id="__span-0-585"><a id="__codelineno-0-585" name="__codelineno-0-585"></a>                <span class="n">out_filename</span> <span class="o">=</span> <span class="n">wget</span><span class="o">.</span><span class="n">download</span><span class="p">(</span><span class="n">source_url</span><span class="p">,</span> <span class="n">out</span><span class="o">=</span><span class="n">output_filepath</span><span class="p">)</span>
+</span><span id="__span-0-586"><a id="__codelineno-0-586" name="__codelineno-0-586"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Completed </span><span class="si">{</span><span class="n">out_filename</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-587"><a id="__codelineno-0-587" name="__codelineno-0-587"></a>            <span class="k">except</span> <span class="n">HTTPError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
+</span><span id="__span-0-588"><a id="__codelineno-0-588" name="__codelineno-0-588"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Error </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-589"><a id="__codelineno-0-589" name="__codelineno-0-589"></a>
+</span><span id="__span-0-590"><a id="__codelineno-0-590" name="__codelineno-0-590"></a>    <span class="k">def</span> <span class="nf">get_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-591"><a id="__codelineno-0-591" name="__codelineno-0-591"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">TOKENS</span><span class="p">:</span>
+</span><span id="__span-0-592"><a id="__codelineno-0-592" name="__codelineno-0-592"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">TOKENS</span>
+</span><span id="__span-0-593"><a id="__codelineno-0-593" name="__codelineno-0-593"></a>        <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_bytes</span><span class="p">():</span>
+</span><span id="__span-0-594"><a id="__codelineno-0-594" name="__codelineno-0-594"></a>            <span class="c1"># Estimate tokens based on bytes</span>
+</span><span id="__span-0-595"><a id="__codelineno-0-595" name="__codelineno-0-595"></a>            <span class="k">return</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_bytes</span><span class="p">()</span> <span class="o">*</span> <span class="n">TOKENS_PER_BYTE</span><span class="p">)</span>
+</span><span id="__span-0-596"><a id="__codelineno-0-596" name="__codelineno-0-596"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-597"><a id="__codelineno-0-597" name="__codelineno-0-597"></a>            <span class="k">return</span> <span class="kc">None</span>
+</span><span id="__span-0-598"><a id="__codelineno-0-598" name="__codelineno-0-598"></a>
+</span><span id="__span-0-599"><a id="__codelineno-0-599" name="__codelineno-0-599"></a>    <span class="k">def</span> <span class="nf">get_bytes</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-600"><a id="__codelineno-0-600" name="__codelineno-0-600"></a>        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">BYTES</span>
+</span><span id="__span-0-601"><a id="__codelineno-0-601" name="__codelineno-0-601"></a>
+</span><span id="__span-0-602"><a id="__codelineno-0-602" name="__codelineno-0-602"></a>    <span class="k">def</span> <span class="nf">get_texts_from_conllu_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handler</span><span class="p">:</span> <span class="n">TextIO</span><span class="p">):</span>
+</span><span id="__span-0-603"><a id="__codelineno-0-603" name="__codelineno-0-603"></a>        <span class="kn">import</span> <span class="nn">conllu</span>
+</span><span id="__span-0-604"><a id="__codelineno-0-604" name="__codelineno-0-604"></a>
+</span><span id="__span-0-605"><a id="__codelineno-0-605" name="__codelineno-0-605"></a>        <span class="n">text</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-606"><a id="__codelineno-0-606" name="__codelineno-0-606"></a>
+</span><span id="__span-0-607"><a id="__codelineno-0-607" name="__codelineno-0-607"></a>        <span class="c1"># try:</span>
+</span><span id="__span-0-608"><a id="__codelineno-0-608" name="__codelineno-0-608"></a>        <span class="k">for</span> <span class="n">sentence</span> <span class="ow">in</span> <span class="n">conllu</span><span class="o">.</span><span class="n">parse_incr</span><span class="p">(</span><span class="n">file_handler</span><span class="p">):</span>
+</span><span id="__span-0-609"><a id="__codelineno-0-609" name="__codelineno-0-609"></a>            <span class="k">if</span> <span class="s2">&quot;newdoc id&quot;</span> <span class="ow">in</span> <span class="n">sentence</span><span class="o">.</span><span class="n">metadata</span><span class="p">:</span>
+</span><span id="__span-0-610"><a id="__codelineno-0-610" name="__codelineno-0-610"></a>                <span class="k">if</span> <span class="n">text</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-611"><a id="__codelineno-0-611" name="__codelineno-0-611"></a>                    <span class="c1"># doc completed</span>
+</span><span id="__span-0-612"><a id="__codelineno-0-612" name="__codelineno-0-612"></a>                    <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-613"><a id="__codelineno-0-613" name="__codelineno-0-613"></a>                <span class="n">text</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>  <span class="c1"># init empty document</span>
+</span><span id="__span-0-614"><a id="__codelineno-0-614" name="__codelineno-0-614"></a>
+</span><span id="__span-0-615"><a id="__codelineno-0-615" name="__codelineno-0-615"></a>            <span class="c1"># append text to doc</span>
+</span><span id="__span-0-616"><a id="__codelineno-0-616" name="__codelineno-0-616"></a>            <span class="k">if</span> <span class="s2">&quot;text&quot;</span> <span class="ow">in</span> <span class="n">sentence</span><span class="o">.</span><span class="n">metadata</span><span class="p">:</span>
+</span><span id="__span-0-617"><a id="__codelineno-0-617" name="__codelineno-0-617"></a>                <span class="k">if</span> <span class="ow">not</span> <span class="n">text</span><span class="p">:</span>
+</span><span id="__span-0-618"><a id="__codelineno-0-618" name="__codelineno-0-618"></a>                    <span class="n">text</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>  <span class="c1"># some conllu are not using doc ids -&gt; force init</span>
+</span><span id="__span-0-619"><a id="__codelineno-0-619" name="__codelineno-0-619"></a>                <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-620"><a id="__codelineno-0-620" name="__codelineno-0-620"></a>                    <span class="n">text</span> <span class="o">+=</span> <span class="s2">&quot; &quot;</span>  <span class="c1"># whitespace between sentences</span>
+</span><span id="__span-0-621"><a id="__codelineno-0-621" name="__codelineno-0-621"></a>
+</span><span id="__span-0-622"><a id="__codelineno-0-622" name="__codelineno-0-622"></a>                <span class="n">text</span> <span class="o">+=</span> <span class="n">sentence</span><span class="o">.</span><span class="n">metadata</span><span class="p">[</span><span class="s2">&quot;text&quot;</span><span class="p">]</span>
+</span><span id="__span-0-623"><a id="__codelineno-0-623" name="__codelineno-0-623"></a>
+</span><span id="__span-0-624"><a id="__codelineno-0-624" name="__codelineno-0-624"></a>            <span class="k">if</span> <span class="s2">&quot;title&quot;</span> <span class="ow">in</span> <span class="n">sentence</span><span class="o">.</span><span class="n">metadata</span><span class="p">:</span>
+</span><span id="__span-0-625"><a id="__codelineno-0-625" name="__codelineno-0-625"></a>                <span class="n">text</span> <span class="o">+=</span> <span class="bp">self</span><span class="o">.</span><span class="n">title_delimiter</span>
+</span><span id="__span-0-626"><a id="__codelineno-0-626" name="__codelineno-0-626"></a>
+</span><span id="__span-0-627"><a id="__codelineno-0-627" name="__codelineno-0-627"></a>        <span class="c1"># yield last document</span>
+</span><span id="__span-0-628"><a id="__codelineno-0-628" name="__codelineno-0-628"></a>        <span class="k">if</span> <span class="n">text</span><span class="p">:</span>
+</span><span id="__span-0-629"><a id="__codelineno-0-629" name="__codelineno-0-629"></a>            <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-630"><a id="__codelineno-0-630" name="__codelineno-0-630"></a>
+</span><span id="__span-0-631"><a id="__codelineno-0-631" name="__codelineno-0-631"></a>        <span class="c1"># except ParseException as e:</span>
+</span><span id="__span-0-632"><a id="__codelineno-0-632" name="__codelineno-0-632"></a>        <span class="c1">#     # TODO</span>
+</span><span id="__span-0-633"><a id="__codelineno-0-633" name="__codelineno-0-633"></a>        <span class="c1">#     logger.error(e)</span>
+</span><span id="__span-0-634"><a id="__codelineno-0-634" name="__codelineno-0-634"></a>
+</span><span id="__span-0-635"><a id="__codelineno-0-635" name="__codelineno-0-635"></a>    <span class="k">def</span> <span class="nf">get_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
+</span><span id="__span-0-636"><a id="__codelineno-0-636" name="__codelineno-0-636"></a>        <span class="k">raise</span> <span class="ne">NotImplementedError</span>
+</span><span id="__span-0-637"><a id="__codelineno-0-637" name="__codelineno-0-637"></a>
+</span><span id="__span-0-638"><a id="__codelineno-0-638" name="__codelineno-0-638"></a>    <span class="k">def</span> <span class="nf">extract_plaintext</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
+</span><span id="__span-0-639"><a id="__codelineno-0-639" name="__codelineno-0-639"></a>        <span class="n">saved_texts_count</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_texts</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_texts</span><span class="p">())</span>
+</span><span id="__span-0-640"><a id="__codelineno-0-640" name="__codelineno-0-640"></a>
+</span><span id="__span-0-641"><a id="__codelineno-0-641" name="__codelineno-0-641"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="p">:</span>
+</span><span id="__span-0-642"><a id="__codelineno-0-642" name="__codelineno-0-642"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Statistics </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-643"><a id="__codelineno-0-643" name="__codelineno-0-643"></a>
+</span><span id="__span-0-644"><a id="__codelineno-0-644" name="__codelineno-0-644"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">save_stats</span><span class="p">:</span>
+</span><span id="__span-0-645"><a id="__codelineno-0-645" name="__codelineno-0-645"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">save_stats</span><span class="p">()</span>
+</span><span id="__span-0-646"><a id="__codelineno-0-646" name="__codelineno-0-646"></a>
+</span><span id="__span-0-647"><a id="__codelineno-0-647" name="__codelineno-0-647"></a>        <span class="k">return</span> <span class="n">saved_texts_count</span>
+</span><span id="__span-0-648"><a id="__codelineno-0-648" name="__codelineno-0-648"></a>
+</span><span id="__span-0-649"><a id="__codelineno-0-649" name="__codelineno-0-649"></a>    <span class="k">def</span> <span class="nf">get_output_rows_count</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
+</span><span id="__span-0-650"><a id="__codelineno-0-650" name="__codelineno-0-650"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Read metadata from parquet files and extract number of rows&quot;&quot;&quot;</span>
+</span><span id="__span-0-651"><a id="__codelineno-0-651" name="__codelineno-0-651"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-652"><a id="__codelineno-0-652" name="__codelineno-0-652"></a>            <span class="n">output_paths</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">))</span>
+</span><span id="__span-0-653"><a id="__codelineno-0-653" name="__codelineno-0-653"></a>
+</span><span id="__span-0-654"><a id="__codelineno-0-654" name="__codelineno-0-654"></a>            <span class="c1"># Filter for existing</span>
+</span><span id="__span-0-655"><a id="__codelineno-0-655" name="__codelineno-0-655"></a>            <span class="n">output_paths</span> <span class="o">=</span> <span class="p">[</span><span class="n">output_path</span> <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="n">output_paths</span> <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_path</span><span class="p">)]</span>
+</span><span id="__span-0-656"><a id="__codelineno-0-656" name="__codelineno-0-656"></a>
+</span><span id="__span-0-657"><a id="__codelineno-0-657" name="__codelineno-0-657"></a>            <span class="k">if</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-658"><a id="__codelineno-0-658" name="__codelineno-0-658"></a>                <span class="n">rows_count</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-659"><a id="__codelineno-0-659" name="__codelineno-0-659"></a>
+</span><span id="__span-0-660"><a id="__codelineno-0-660" name="__codelineno-0-660"></a>                <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-661"><a id="__codelineno-0-661" name="__codelineno-0-661"></a>                    <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">output_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-662"><a id="__codelineno-0-662" name="__codelineno-0-662"></a>                        <span class="n">parquet_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-663"><a id="__codelineno-0-663" name="__codelineno-0-663"></a>                            <span class="n">f</span><span class="p">,</span>
+</span><span id="__span-0-664"><a id="__codelineno-0-664" name="__codelineno-0-664"></a>                            <span class="c1"># increased to avoid OSErrors</span>
+</span><span id="__span-0-665"><a id="__codelineno-0-665" name="__codelineno-0-665"></a>                            <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="mi">1000000000</span><span class="p">,</span>  <span class="c1"># default: 100000000</span>
+</span><span id="__span-0-666"><a id="__codelineno-0-666" name="__codelineno-0-666"></a>                            <span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="mi">10000000</span><span class="p">,</span>  <span class="c1"># default: 1000000</span>
+</span><span id="__span-0-667"><a id="__codelineno-0-667" name="__codelineno-0-667"></a>                        <span class="p">)</span>
+</span><span id="__span-0-668"><a id="__codelineno-0-668" name="__codelineno-0-668"></a>                        <span class="n">rows_count</span> <span class="o">+=</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_rows</span>
+</span><span id="__span-0-669"><a id="__codelineno-0-669" name="__codelineno-0-669"></a>
+</span><span id="__span-0-670"><a id="__codelineno-0-670" name="__codelineno-0-670"></a>                        <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;Rows = </span><span class="si">%s</span><span class="s2"> in </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">rows_count</span><span class="p">,</span> <span class="n">output_path</span><span class="p">)</span>
+</span><span id="__span-0-671"><a id="__codelineno-0-671" name="__codelineno-0-671"></a>
+</span><span id="__span-0-672"><a id="__codelineno-0-672" name="__codelineno-0-672"></a>                <span class="k">return</span> <span class="n">rows_count</span>
+</span><span id="__span-0-673"><a id="__codelineno-0-673" name="__codelineno-0-673"></a>
+</span><span id="__span-0-674"><a id="__codelineno-0-674" name="__codelineno-0-674"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;No output files exists for </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">)</span>
+</span><span id="__span-0-675"><a id="__codelineno-0-675" name="__codelineno-0-675"></a>            <span class="k">return</span> <span class="o">-</span><span class="mi">1</span>
+</span><span id="__span-0-676"><a id="__codelineno-0-676" name="__codelineno-0-676"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-677"><a id="__codelineno-0-677" name="__codelineno-0-677"></a>            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cannot determine the output rows count with </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_format</span><span class="si">=}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-678"><a id="__codelineno-0-678" name="__codelineno-0-678"></a>
+</span><span id="__span-0-679"><a id="__codelineno-0-679" name="__codelineno-0-679"></a>    <span class="k">def</span> <span class="nf">get_compression_from_output_files</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-680"><a id="__codelineno-0-680" name="__codelineno-0-680"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;NOTE: Currently only implemented for `parquet` format.&quot;&quot;&quot;</span>
+</span><span id="__span-0-681"><a id="__codelineno-0-681" name="__codelineno-0-681"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-682"><a id="__codelineno-0-682" name="__codelineno-0-682"></a>            <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">):</span>
+</span><span id="__span-0-683"><a id="__codelineno-0-683" name="__codelineno-0-683"></a>                <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_path</span><span class="p">):</span>
+</span><span id="__span-0-684"><a id="__codelineno-0-684" name="__codelineno-0-684"></a>                    <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">output_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-685"><a id="__codelineno-0-685" name="__codelineno-0-685"></a>                        <span class="n">parquet_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-686"><a id="__codelineno-0-686" name="__codelineno-0-686"></a>                            <span class="n">f</span><span class="p">,</span>
+</span><span id="__span-0-687"><a id="__codelineno-0-687" name="__codelineno-0-687"></a>                            <span class="c1"># increased to avoid OSErrors</span>
+</span><span id="__span-0-688"><a id="__codelineno-0-688" name="__codelineno-0-688"></a>                            <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="mi">1000000000</span><span class="p">,</span>  <span class="c1"># default: 100000000</span>
+</span><span id="__span-0-689"><a id="__codelineno-0-689" name="__codelineno-0-689"></a>                            <span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="mi">10000000</span><span class="p">,</span>  <span class="c1"># default: 1000000</span>
+</span><span id="__span-0-690"><a id="__codelineno-0-690" name="__codelineno-0-690"></a>                        <span class="p">)</span>
+</span><span id="__span-0-691"><a id="__codelineno-0-691" name="__codelineno-0-691"></a>                        <span class="n">parquet_metadata</span> <span class="o">=</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span>
+</span><span id="__span-0-692"><a id="__codelineno-0-692" name="__codelineno-0-692"></a>                        <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">parquet_metadata</span><span class="o">.</span><span class="n">num_row_groups</span><span class="p">):</span>
+</span><span id="__span-0-693"><a id="__codelineno-0-693" name="__codelineno-0-693"></a>                            <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">parquet_metadata</span><span class="o">.</span><span class="n">num_columns</span><span class="p">):</span>
+</span><span id="__span-0-694"><a id="__codelineno-0-694" name="__codelineno-0-694"></a>                                <span class="k">return</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">row_group</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">.</span><span class="n">column</span><span class="p">(</span><span class="n">j</span><span class="p">)</span><span class="o">.</span><span class="n">compression</span>
+</span><span id="__span-0-695"><a id="__codelineno-0-695" name="__codelineno-0-695"></a>
+</span><span id="__span-0-696"><a id="__codelineno-0-696" name="__codelineno-0-696"></a>        <span class="k">return</span> <span class="s2">&quot;unknown&quot;</span>
+</span><span id="__span-0-697"><a id="__codelineno-0-697" name="__codelineno-0-697"></a>
+</span><span id="__span-0-698"><a id="__codelineno-0-698" name="__codelineno-0-698"></a>    <span class="k">def</span> <span class="nf">generate_texts_from_output</span><span class="p">(</span>
+</span><span id="__span-0-699"><a id="__codelineno-0-699" name="__codelineno-0-699"></a>        <span class="bp">self</span><span class="p">,</span>
+</span><span id="__span-0-700"><a id="__codelineno-0-700" name="__codelineno-0-700"></a>        <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-701"><a id="__codelineno-0-701" name="__codelineno-0-701"></a>        <span class="n">batch_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-702"><a id="__codelineno-0-702" name="__codelineno-0-702"></a>        <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+</span><span id="__span-0-703"><a id="__codelineno-0-703" name="__codelineno-0-703"></a>        <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+</span><span id="__span-0-704"><a id="__codelineno-0-704" name="__codelineno-0-704"></a>        <span class="n">shuffle_output_file_paths</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-705"><a id="__codelineno-0-705" name="__codelineno-0-705"></a>        <span class="n">reader_implementation</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;polars_read_parquet&quot;</span><span class="p">,</span> <span class="s2">&quot;pyarrow&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;pyarrow&quot;</span><span class="p">,</span>
+</span><span id="__span-0-706"><a id="__codelineno-0-706" name="__codelineno-0-706"></a>        <span class="n">cast_to_py_string</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-707"><a id="__codelineno-0-707" name="__codelineno-0-707"></a>    <span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">StringScalar</span><span class="p">]]:</span>
+</span><span id="__span-0-708"><a id="__codelineno-0-708" name="__codelineno-0-708"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;An iterator over texts from processed output files.&quot;&quot;&quot;</span>
+</span><span id="__span-0-709"><a id="__codelineno-0-709" name="__codelineno-0-709"></a>        <span class="k">if</span> <span class="n">batch_size</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-710"><a id="__codelineno-0-710" name="__codelineno-0-710"></a>            <span class="n">batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span>
+</span><span id="__span-0-711"><a id="__codelineno-0-711" name="__codelineno-0-711"></a>
+</span><span id="__span-0-712"><a id="__codelineno-0-712" name="__codelineno-0-712"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">!=</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-713"><a id="__codelineno-0-713" name="__codelineno-0-713"></a>            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cannot generate texts with </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_format</span><span class="si">=}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-714"><a id="__codelineno-0-714" name="__codelineno-0-714"></a>
+</span><span id="__span-0-715"><a id="__codelineno-0-715" name="__codelineno-0-715"></a>        <span class="c1"># Check if output files exist and sort them</span>
+</span><span id="__span-0-716"><a id="__codelineno-0-716" name="__codelineno-0-716"></a>        <span class="n">output_paths</span> <span class="o">=</span> <span class="p">[</span>
+</span><span id="__span-0-717"><a id="__codelineno-0-717" name="__codelineno-0-717"></a>            <span class="n">file_path</span>
+</span><span id="__span-0-718"><a id="__codelineno-0-718" name="__codelineno-0-718"></a>            <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">))</span>
+</span><span id="__span-0-719"><a id="__codelineno-0-719" name="__codelineno-0-719"></a>            <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-720"><a id="__codelineno-0-720" name="__codelineno-0-720"></a>        <span class="p">]</span>
+</span><span id="__span-0-721"><a id="__codelineno-0-721" name="__codelineno-0-721"></a>
+</span><span id="__span-0-722"><a id="__codelineno-0-722" name="__codelineno-0-722"></a>        <span class="c1"># Count generated rows</span>
+</span><span id="__span-0-723"><a id="__codelineno-0-723" name="__codelineno-0-723"></a>        <span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-724"><a id="__codelineno-0-724" name="__codelineno-0-724"></a>        <span class="n">rows_limit</span> <span class="o">=</span> <span class="n">limit</span> <span class="o">-</span> <span class="n">offset</span>
+</span><span id="__span-0-725"><a id="__codelineno-0-725" name="__codelineno-0-725"></a>
+</span><span id="__span-0-726"><a id="__codelineno-0-726" name="__codelineno-0-726"></a>        <span class="c1"># if limit &gt; 0:</span>
+</span><span id="__span-0-727"><a id="__codelineno-0-727" name="__codelineno-0-727"></a>        <span class="c1">#     batch_size = min(batch_size, limit)</span>
+</span><span id="__span-0-728"><a id="__codelineno-0-728" name="__codelineno-0-728"></a>
+</span><span id="__span-0-729"><a id="__codelineno-0-729" name="__codelineno-0-729"></a>        <span class="c1"># Shuffle output chunks:</span>
+</span><span id="__span-0-730"><a id="__codelineno-0-730" name="__codelineno-0-730"></a>        <span class="c1"># This changes the order in which the chunks are read, to also ensure shuffling on the full dataset level.</span>
+</span><span id="__span-0-731"><a id="__codelineno-0-731" name="__codelineno-0-731"></a>        <span class="k">if</span> <span class="n">shuffle_output_file_paths</span><span class="p">:</span>
+</span><span id="__span-0-732"><a id="__codelineno-0-732" name="__codelineno-0-732"></a>            <span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">seed</span><span class="p">)</span>  <span class="c1"># reset seed to avoid interference by other scripts</span>
+</span><span id="__span-0-733"><a id="__codelineno-0-733" name="__codelineno-0-733"></a>            <span class="n">random</span><span class="o">.</span><span class="n">shuffle</span><span class="p">(</span><span class="n">output_paths</span><span class="p">)</span>
+</span><span id="__span-0-734"><a id="__codelineno-0-734" name="__codelineno-0-734"></a>
+</span><span id="__span-0-735"><a id="__codelineno-0-735" name="__codelineno-0-735"></a>        <span class="n">chunk_start</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-736"><a id="__codelineno-0-736" name="__codelineno-0-736"></a>        <span class="n">chunk_end</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-737"><a id="__codelineno-0-737" name="__codelineno-0-737"></a>
+</span><span id="__span-0-738"><a id="__codelineno-0-738" name="__codelineno-0-738"></a>        <span class="k">if</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-739"><a id="__codelineno-0-739" name="__codelineno-0-739"></a>            <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-740"><a id="__codelineno-0-740" name="__codelineno-0-740"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Generating text from </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-741"><a id="__codelineno-0-741" name="__codelineno-0-741"></a>
+</span><span id="__span-0-742"><a id="__codelineno-0-742" name="__codelineno-0-742"></a>                <span class="c1"># PyArrow implementation</span>
+</span><span id="__span-0-743"><a id="__codelineno-0-743" name="__codelineno-0-743"></a>                <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">file_handler</span><span class="p">:</span>
+</span><span id="__span-0-744"><a id="__codelineno-0-744" name="__codelineno-0-744"></a>                    <span class="n">pq_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-745"><a id="__codelineno-0-745" name="__codelineno-0-745"></a>                        <span class="n">file_handler</span><span class="p">,</span>
+</span><span id="__span-0-746"><a id="__codelineno-0-746" name="__codelineno-0-746"></a>                        <span class="c1"># memory_map=False,</span>
+</span><span id="__span-0-747"><a id="__codelineno-0-747" name="__codelineno-0-747"></a>                    <span class="p">)</span>
+</span><span id="__span-0-748"><a id="__codelineno-0-748" name="__codelineno-0-748"></a>                    <span class="n">file_rows_count</span> <span class="o">=</span> <span class="n">pq_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_rows</span>
+</span><span id="__span-0-749"><a id="__codelineno-0-749" name="__codelineno-0-749"></a>
+</span><span id="__span-0-750"><a id="__codelineno-0-750" name="__codelineno-0-750"></a>                    <span class="n">chunk_end</span> <span class="o">=</span> <span class="n">chunk_start</span> <span class="o">+</span> <span class="n">file_rows_count</span> <span class="o">-</span> <span class="mi">1</span>
+</span><span id="__span-0-751"><a id="__codelineno-0-751" name="__codelineno-0-751"></a>
+</span><span id="__span-0-752"><a id="__codelineno-0-752" name="__codelineno-0-752"></a>                    <span class="c1"># Should we read from the current chunk?</span>
+</span><span id="__span-0-753"><a id="__codelineno-0-753" name="__codelineno-0-753"></a>                    <span class="c1"># Yes, if</span>
+</span><span id="__span-0-754"><a id="__codelineno-0-754" name="__codelineno-0-754"></a>                    <span class="c1"># - offset is smaller than or equal to chunk_start</span>
+</span><span id="__span-0-755"><a id="__codelineno-0-755" name="__codelineno-0-755"></a>                    <span class="c1"># (- limit is greater than or equal to chunk_end) --- limit does not matter</span>
+</span><span id="__span-0-756"><a id="__codelineno-0-756" name="__codelineno-0-756"></a>
+</span><span id="__span-0-757"><a id="__codelineno-0-757" name="__codelineno-0-757"></a>                    <span class="c1"># variants</span>
+</span><span id="__span-0-758"><a id="__codelineno-0-758" name="__codelineno-0-758"></a>                    <span class="c1"># A) requested rows start in chunk and end in chunk</span>
+</span><span id="__span-0-759"><a id="__codelineno-0-759" name="__codelineno-0-759"></a>                    <span class="c1"># B) requested rows start in chunk but end in following chunk</span>
+</span><span id="__span-0-760"><a id="__codelineno-0-760" name="__codelineno-0-760"></a>                    <span class="c1"># C) requested rows start before chunk and end in chunk</span>
+</span><span id="__span-0-761"><a id="__codelineno-0-761" name="__codelineno-0-761"></a>                    <span class="c1"># D) requested rows start before chunk and end in following chunk</span>
+</span><span id="__span-0-762"><a id="__codelineno-0-762" name="__codelineno-0-762"></a>
+</span><span id="__span-0-763"><a id="__codelineno-0-763" name="__codelineno-0-763"></a>                    <span class="k">if</span> <span class="p">(</span>
+</span><span id="__span-0-764"><a id="__codelineno-0-764" name="__codelineno-0-764"></a>                        <span class="n">chunk_start</span> <span class="o">&lt;=</span> <span class="n">offset</span> <span class="o">&lt;</span> <span class="n">chunk_end</span>
+</span><span id="__span-0-765"><a id="__codelineno-0-765" name="__codelineno-0-765"></a>                        <span class="ow">or</span> <span class="n">offset</span> <span class="o">&lt;</span> <span class="n">chunk_start</span>
+</span><span id="__span-0-766"><a id="__codelineno-0-766" name="__codelineno-0-766"></a>                        <span class="ow">and</span> <span class="p">(</span><span class="n">limit</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">chunk_start</span> <span class="o">&lt;</span> <span class="n">limit</span><span class="p">)</span>
+</span><span id="__span-0-767"><a id="__codelineno-0-767" name="__codelineno-0-767"></a>                    <span class="p">):</span>
+</span><span id="__span-0-768"><a id="__codelineno-0-768" name="__codelineno-0-768"></a>                        <span class="n">file_offset</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span>
+</span><span id="__span-0-769"><a id="__codelineno-0-769" name="__codelineno-0-769"></a>                            <span class="mi">0</span><span class="p">,</span> <span class="n">offset</span> <span class="o">-</span> <span class="n">chunk_start</span>
+</span><span id="__span-0-770"><a id="__codelineno-0-770" name="__codelineno-0-770"></a>                        <span class="p">)</span>  <span class="c1"># global offset minus start of current file (current chunk)</span>
+</span><span id="__span-0-771"><a id="__codelineno-0-771" name="__codelineno-0-771"></a>                        <span class="n">file_limit</span> <span class="o">=</span> <span class="p">(</span>
+</span><span id="__span-0-772"><a id="__codelineno-0-772" name="__codelineno-0-772"></a>                            <span class="nb">max</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">limit</span> <span class="o">-</span> <span class="n">chunk_start</span><span class="p">)</span> <span class="k">if</span> <span class="n">limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="k">else</span> <span class="mi">0</span>  <span class="c1"># limit - chunk_start</span>
+</span><span id="__span-0-773"><a id="__codelineno-0-773" name="__codelineno-0-773"></a>                        <span class="p">)</span>  <span class="c1"># Length of the slice: global limit minus start of current chunk</span>
+</span><span id="__span-0-774"><a id="__codelineno-0-774" name="__codelineno-0-774"></a>                        <span class="c1"># TODO before: limit - chunk_start - file_offset</span>
+</span><span id="__span-0-775"><a id="__codelineno-0-775" name="__codelineno-0-775"></a>
+</span><span id="__span-0-776"><a id="__codelineno-0-776" name="__codelineno-0-776"></a>                        <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span>
+</span><span id="__span-0-777"><a id="__codelineno-0-777" name="__codelineno-0-777"></a>                            <span class="s2">&quot;Reading file chunk from </span><span class="si">%s</span><span class="s2">: file [</span><span class="si">%s</span><span class="s2"> - </span><span class="si">%s</span><span class="s2">]; global [</span><span class="si">%s</span><span class="s2"> - </span><span class="si">%s</span><span class="s2">]; chunk [</span><span class="si">%s</span><span class="s2"> - </span><span class="si">%s</span><span class="s2">]&quot;</span><span class="p">,</span>
+</span><span id="__span-0-778"><a id="__codelineno-0-778" name="__codelineno-0-778"></a>                            <span class="n">file_path</span><span class="p">,</span>
+</span><span id="__span-0-779"><a id="__codelineno-0-779" name="__codelineno-0-779"></a>                            <span class="n">file_offset</span><span class="p">,</span>
+</span><span id="__span-0-780"><a id="__codelineno-0-780" name="__codelineno-0-780"></a>                            <span class="n">file_limit</span><span class="p">,</span>
+</span><span id="__span-0-781"><a id="__codelineno-0-781" name="__codelineno-0-781"></a>                            <span class="n">offset</span><span class="p">,</span>
+</span><span id="__span-0-782"><a id="__codelineno-0-782" name="__codelineno-0-782"></a>                            <span class="n">limit</span><span class="p">,</span>
+</span><span id="__span-0-783"><a id="__codelineno-0-783" name="__codelineno-0-783"></a>                            <span class="n">chunk_start</span><span class="p">,</span>
+</span><span id="__span-0-784"><a id="__codelineno-0-784" name="__codelineno-0-784"></a>                            <span class="n">chunk_end</span><span class="p">,</span>
+</span><span id="__span-0-785"><a id="__codelineno-0-785" name="__codelineno-0-785"></a>                        <span class="p">)</span>
+</span><span id="__span-0-786"><a id="__codelineno-0-786" name="__codelineno-0-786"></a>                        <span class="k">if</span> <span class="n">reader_implementation</span> <span class="o">==</span> <span class="s2">&quot;pyarrow&quot;</span><span class="p">:</span>
+</span><span id="__span-0-787"><a id="__codelineno-0-787" name="__codelineno-0-787"></a>                            <span class="c1"># PyArrow implementation with iter_batches</span>
+</span><span id="__span-0-788"><a id="__codelineno-0-788" name="__codelineno-0-788"></a>                            <span class="c1"># with open(file_path, &quot;rb&quot;) as file_handler:</span>
+</span><span id="__span-0-789"><a id="__codelineno-0-789" name="__codelineno-0-789"></a>                            <span class="c1">#     parquet_file = pq.ParquetFile(file_handler)</span>
+</span><span id="__span-0-790"><a id="__codelineno-0-790" name="__codelineno-0-790"></a>
+</span><span id="__span-0-791"><a id="__codelineno-0-791" name="__codelineno-0-791"></a>                            <span class="k">for</span> <span class="n">batch_idx</span><span class="p">,</span> <span class="n">pq_batch</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span>
+</span><span id="__span-0-792"><a id="__codelineno-0-792" name="__codelineno-0-792"></a>                                <span class="n">pq_file</span><span class="o">.</span><span class="n">iter_batches</span><span class="p">(</span>
+</span><span id="__span-0-793"><a id="__codelineno-0-793" name="__codelineno-0-793"></a>                                    <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">()],</span> <span class="n">batch_size</span><span class="o">=</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">False</span>
+</span><span id="__span-0-794"><a id="__codelineno-0-794" name="__codelineno-0-794"></a>                                <span class="p">)</span>
+</span><span id="__span-0-795"><a id="__codelineno-0-795" name="__codelineno-0-795"></a>                            <span class="p">):</span>
+</span><span id="__span-0-796"><a id="__codelineno-0-796" name="__codelineno-0-796"></a>                                <span class="k">for</span> <span class="n">row_idx</span><span class="p">,</span> <span class="n">text_column</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">pq_batch</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">batch_idx</span> <span class="o">*</span> <span class="n">batch_size</span><span class="p">):</span>
+</span><span id="__span-0-797"><a id="__codelineno-0-797" name="__codelineno-0-797"></a>                                    <span class="k">if</span> <span class="n">row_idx</span> <span class="o">&gt;=</span> <span class="n">file_offset</span><span class="p">:</span>
+</span><span id="__span-0-798"><a id="__codelineno-0-798" name="__codelineno-0-798"></a>                                        <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-799"><a id="__codelineno-0-799" name="__codelineno-0-799"></a>                                            <span class="c1"># break row loop</span>
+</span><span id="__span-0-800"><a id="__codelineno-0-800" name="__codelineno-0-800"></a>                                            <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;break row loop&quot;</span><span class="p">)</span>
+</span><span id="__span-0-801"><a id="__codelineno-0-801" name="__codelineno-0-801"></a>                                            <span class="k">break</span>
+</span><span id="__span-0-802"><a id="__codelineno-0-802" name="__codelineno-0-802"></a>
+</span><span id="__span-0-803"><a id="__codelineno-0-803" name="__codelineno-0-803"></a>                                        <span class="n">text</span><span class="p">:</span> <span class="n">pa</span><span class="o">.</span><span class="n">StringScalar</span> <span class="o">=</span> <span class="n">text_column</span>
+</span><span id="__span-0-804"><a id="__codelineno-0-804" name="__codelineno-0-804"></a>
+</span><span id="__span-0-805"><a id="__codelineno-0-805" name="__codelineno-0-805"></a>                                        <span class="k">if</span> <span class="n">cast_to_py_string</span><span class="p">:</span>
+</span><span id="__span-0-806"><a id="__codelineno-0-806" name="__codelineno-0-806"></a>                                            <span class="c1"># cast to string</span>
+</span><span id="__span-0-807"><a id="__codelineno-0-807" name="__codelineno-0-807"></a>                                            <span class="n">text</span> <span class="o">=</span> <span class="n">text_column</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span>
+</span><span id="__span-0-808"><a id="__codelineno-0-808" name="__codelineno-0-808"></a>
+</span><span id="__span-0-809"><a id="__codelineno-0-809" name="__codelineno-0-809"></a>                                        <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-810"><a id="__codelineno-0-810" name="__codelineno-0-810"></a>                                        <span class="n">rows</span> <span class="o">+=</span> <span class="mi">1</span>
+</span><span id="__span-0-811"><a id="__codelineno-0-811" name="__codelineno-0-811"></a>
+</span><span id="__span-0-812"><a id="__codelineno-0-812" name="__codelineno-0-812"></a>                                <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-813"><a id="__codelineno-0-813" name="__codelineno-0-813"></a>                                    <span class="c1"># break batch loop</span>
+</span><span id="__span-0-814"><a id="__codelineno-0-814" name="__codelineno-0-814"></a>                                    <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;break batch loop&quot;</span><span class="p">)</span>
+</span><span id="__span-0-815"><a id="__codelineno-0-815" name="__codelineno-0-815"></a>                                    <span class="k">break</span>
+</span><span id="__span-0-816"><a id="__codelineno-0-816" name="__codelineno-0-816"></a>
+</span><span id="__span-0-817"><a id="__codelineno-0-817" name="__codelineno-0-817"></a>                            <span class="c1"># PyArrow implementation with read_row_group</span>
+</span><span id="__span-0-818"><a id="__codelineno-0-818" name="__codelineno-0-818"></a>                            <span class="c1"># with open(file_path, &quot;rb&quot;) as file_handler:</span>
+</span><span id="__span-0-819"><a id="__codelineno-0-819" name="__codelineno-0-819"></a>                            <span class="c1">#     parquet_file = pq.ParquetFile(file_handler)</span>
+</span><span id="__span-0-820"><a id="__codelineno-0-820" name="__codelineno-0-820"></a>
+</span><span id="__span-0-821"><a id="__codelineno-0-821" name="__codelineno-0-821"></a>                            <span class="c1">#     # 1. What row groups need to be read?</span>
+</span><span id="__span-0-822"><a id="__codelineno-0-822" name="__codelineno-0-822"></a>                            <span class="c1">#     row_groups, group_idx_to_offset_last_row = get_selected_row_groups(</span>
+</span><span id="__span-0-823"><a id="__codelineno-0-823" name="__codelineno-0-823"></a>                            <span class="c1">#         parquet_file, file_offset, file_limit</span>
+</span><span id="__span-0-824"><a id="__codelineno-0-824" name="__codelineno-0-824"></a>                            <span class="c1">#     )</span>
+</span><span id="__span-0-825"><a id="__codelineno-0-825" name="__codelineno-0-825"></a>                            <span class="c1">#     logger.debug(&quot;Selected row groups: %s; %s&quot;, row_groups, group_idx_to_offset_last_row)</span>
+</span><span id="__span-0-826"><a id="__codelineno-0-826" name="__codelineno-0-826"></a>
+</span><span id="__span-0-827"><a id="__codelineno-0-827" name="__codelineno-0-827"></a>                            <span class="c1">#     # 2. Read selected row groups</span>
+</span><span id="__span-0-828"><a id="__codelineno-0-828" name="__codelineno-0-828"></a>                            <span class="c1">#     for selected_row_group in row_groups:</span>
+</span><span id="__span-0-829"><a id="__codelineno-0-829" name="__codelineno-0-829"></a>                            <span class="c1">#         logger.debug(&quot;Read row group: %s&quot;, selected_row_group)</span>
+</span><span id="__span-0-830"><a id="__codelineno-0-830" name="__codelineno-0-830"></a>                            <span class="c1">#         group_table = parquet_file.read_row_group(</span>
+</span><span id="__span-0-831"><a id="__codelineno-0-831" name="__codelineno-0-831"></a>                            <span class="c1">#             selected_row_group, columns=[self.get_output_text_field()]</span>
+</span><span id="__span-0-832"><a id="__codelineno-0-832" name="__codelineno-0-832"></a>                            <span class="c1">#         )</span>
+</span><span id="__span-0-833"><a id="__codelineno-0-833" name="__codelineno-0-833"></a>
+</span><span id="__span-0-834"><a id="__codelineno-0-834" name="__codelineno-0-834"></a>                            <span class="c1">#         # What offsets and limit? (only if needed)</span>
+</span><span id="__span-0-835"><a id="__codelineno-0-835" name="__codelineno-0-835"></a>                            <span class="c1">#         if group_idx_to_offset_last_row is not None:</span>
+</span><span id="__span-0-836"><a id="__codelineno-0-836" name="__codelineno-0-836"></a>                            <span class="c1">#             group_offset, _ = group_idx_to_offset_last_row[selected_row_group]</span>
+</span><span id="__span-0-837"><a id="__codelineno-0-837" name="__codelineno-0-837"></a>
+</span><span id="__span-0-838"><a id="__codelineno-0-838" name="__codelineno-0-838"></a>                            <span class="c1">#             row_offset = max(0, file_offset - group_offset)</span>
+</span><span id="__span-0-839"><a id="__codelineno-0-839" name="__codelineno-0-839"></a>                            <span class="c1">#             logger.debug(&quot;Row group: %s; row offset: %s&quot;, selected_row_group, row_offset)</span>
+</span><span id="__span-0-840"><a id="__codelineno-0-840" name="__codelineno-0-840"></a>
+</span><span id="__span-0-841"><a id="__codelineno-0-841" name="__codelineno-0-841"></a>                            <span class="c1">#         # Iterate over rows</span>
+</span><span id="__span-0-842"><a id="__codelineno-0-842" name="__codelineno-0-842"></a>                            <span class="c1">#         for row_idx, text_column in enumerate(group_table.columns[0]):</span>
+</span><span id="__span-0-843"><a id="__codelineno-0-843" name="__codelineno-0-843"></a>                            <span class="c1">#             # Skip rows before offset</span>
+</span><span id="__span-0-844"><a id="__codelineno-0-844" name="__codelineno-0-844"></a>                            <span class="c1">#             if group_idx_to_offset_last_row is None or row_idx &gt;= row_offset:</span>
+</span><span id="__span-0-845"><a id="__codelineno-0-845" name="__codelineno-0-845"></a>                            <span class="c1">#                 if rows_limit &gt; 0 and rows &gt;= rows_limit:</span>
+</span><span id="__span-0-846"><a id="__codelineno-0-846" name="__codelineno-0-846"></a>                            <span class="c1">#                     # break row loop</span>
+</span><span id="__span-0-847"><a id="__codelineno-0-847" name="__codelineno-0-847"></a>                            <span class="c1">#                     logger.debug(&quot;break row loop&quot;)</span>
+</span><span id="__span-0-848"><a id="__codelineno-0-848" name="__codelineno-0-848"></a>                            <span class="c1">#                     break</span>
+</span><span id="__span-0-849"><a id="__codelineno-0-849" name="__codelineno-0-849"></a>
+</span><span id="__span-0-850"><a id="__codelineno-0-850" name="__codelineno-0-850"></a>                            <span class="c1">#                 text = text_column.as_py()  # cast to str</span>
+</span><span id="__span-0-851"><a id="__codelineno-0-851" name="__codelineno-0-851"></a>                            <span class="c1">#                 yield text</span>
+</span><span id="__span-0-852"><a id="__codelineno-0-852" name="__codelineno-0-852"></a>                            <span class="c1">#                 rows += 1</span>
+</span><span id="__span-0-853"><a id="__codelineno-0-853" name="__codelineno-0-853"></a>
+</span><span id="__span-0-854"><a id="__codelineno-0-854" name="__codelineno-0-854"></a>                            <span class="c1">#         if rows_limit &gt; 0 and rows &gt;= rows_limit:</span>
+</span><span id="__span-0-855"><a id="__codelineno-0-855" name="__codelineno-0-855"></a>                            <span class="c1">#             # break row group loop</span>
+</span><span id="__span-0-856"><a id="__codelineno-0-856" name="__codelineno-0-856"></a>                            <span class="c1">#             logger.debug(&quot;break row group loop&quot;)</span>
+</span><span id="__span-0-857"><a id="__codelineno-0-857" name="__codelineno-0-857"></a>                            <span class="c1">#             break</span>
+</span><span id="__span-0-858"><a id="__codelineno-0-858" name="__codelineno-0-858"></a>
+</span><span id="__span-0-859"><a id="__codelineno-0-859" name="__codelineno-0-859"></a>                        <span class="k">elif</span> <span class="n">reader_implementation</span> <span class="o">==</span> <span class="s2">&quot;polars_read_parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-860"><a id="__codelineno-0-860" name="__codelineno-0-860"></a>                            <span class="c1"># Polars &quot;scan_parquet&quot; implementation: Error &quot;Segmentation fault (core dumped)&quot;</span>
+</span><span id="__span-0-861"><a id="__codelineno-0-861" name="__codelineno-0-861"></a>                            <span class="c1"># df = (</span>
+</span><span id="__span-0-862"><a id="__codelineno-0-862" name="__codelineno-0-862"></a>                            <span class="c1">#     pl.scan_parquet(file_path, low_memory=True).collect(</span>
+</span><span id="__span-0-863"><a id="__codelineno-0-863" name="__codelineno-0-863"></a>                            <span class="c1">#     streaming=True</span>
+</span><span id="__span-0-864"><a id="__codelineno-0-864" name="__codelineno-0-864"></a>                            <span class="c1"># ).slice(offset=file_offset, length=file_limit if file_limit != 0 else None)</span>
+</span><span id="__span-0-865"><a id="__codelineno-0-865" name="__codelineno-0-865"></a>                            <span class="c1">#     .collect(streaming=True)</span>
+</span><span id="__span-0-866"><a id="__codelineno-0-866" name="__codelineno-0-866"></a>                            <span class="c1"># )</span>
+</span><span id="__span-0-867"><a id="__codelineno-0-867" name="__codelineno-0-867"></a>                            <span class="c1"># text_column_index = df.columns.index(self.get_output_text_field())</span>
+</span><span id="__span-0-868"><a id="__codelineno-0-868" name="__codelineno-0-868"></a>
+</span><span id="__span-0-869"><a id="__codelineno-0-869" name="__codelineno-0-869"></a>                            <span class="n">df</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span>
+</span><span id="__span-0-870"><a id="__codelineno-0-870" name="__codelineno-0-870"></a>                                <span class="n">file_path</span><span class="p">,</span> <span class="n">low_memory</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">()]</span>
+</span><span id="__span-0-871"><a id="__codelineno-0-871" name="__codelineno-0-871"></a>                            <span class="p">)</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="n">offset</span><span class="o">=</span><span class="n">file_offset</span><span class="p">,</span> <span class="n">length</span><span class="o">=</span><span class="n">file_limit</span> <span class="k">if</span> <span class="n">file_limit</span> <span class="o">!=</span> <span class="mi">0</span> <span class="k">else</span> <span class="kc">None</span><span class="p">)</span>
+</span><span id="__span-0-872"><a id="__codelineno-0-872" name="__codelineno-0-872"></a>                            <span class="n">text_column_index</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-873"><a id="__codelineno-0-873" name="__codelineno-0-873"></a>
+</span><span id="__span-0-874"><a id="__codelineno-0-874" name="__codelineno-0-874"></a>                            <span class="c1"># Iterate over rows</span>
+</span><span id="__span-0-875"><a id="__codelineno-0-875" name="__codelineno-0-875"></a>                            <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">iter_rows</span><span class="p">():</span>
+</span><span id="__span-0-876"><a id="__codelineno-0-876" name="__codelineno-0-876"></a>                                <span class="n">text</span> <span class="o">=</span> <span class="n">row</span><span class="p">[</span><span class="n">text_column_index</span><span class="p">]</span>
+</span><span id="__span-0-877"><a id="__codelineno-0-877" name="__codelineno-0-877"></a>
+</span><span id="__span-0-878"><a id="__codelineno-0-878" name="__codelineno-0-878"></a>                                <span class="k">if</span> <span class="n">cast_to_py_string</span><span class="p">:</span>
+</span><span id="__span-0-879"><a id="__codelineno-0-879" name="__codelineno-0-879"></a>                                    <span class="n">text</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
+</span><span id="__span-0-880"><a id="__codelineno-0-880" name="__codelineno-0-880"></a>
+</span><span id="__span-0-881"><a id="__codelineno-0-881" name="__codelineno-0-881"></a>                                <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-882"><a id="__codelineno-0-882" name="__codelineno-0-882"></a>                                <span class="n">rows</span> <span class="o">+=</span> <span class="mi">1</span>
+</span><span id="__span-0-883"><a id="__codelineno-0-883" name="__codelineno-0-883"></a>
+</span><span id="__span-0-884"><a id="__codelineno-0-884" name="__codelineno-0-884"></a>                                <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-885"><a id="__codelineno-0-885" name="__codelineno-0-885"></a>                                    <span class="c1"># break row loop</span>
+</span><span id="__span-0-886"><a id="__codelineno-0-886" name="__codelineno-0-886"></a>                                    <span class="k">break</span>
+</span><span id="__span-0-887"><a id="__codelineno-0-887" name="__codelineno-0-887"></a>                            <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-888"><a id="__codelineno-0-888" name="__codelineno-0-888"></a>                                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Invalid `reader_implementation`&quot;</span><span class="p">)</span>
+</span><span id="__span-0-889"><a id="__codelineno-0-889" name="__codelineno-0-889"></a>                    <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-890"><a id="__codelineno-0-890" name="__codelineno-0-890"></a>                        <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;Skip this file because output does not contain the requested rows: </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-891"><a id="__codelineno-0-891" name="__codelineno-0-891"></a>
+</span><span id="__span-0-892"><a id="__codelineno-0-892" name="__codelineno-0-892"></a>                    <span class="c1"># current_offset += file_rows_count  # TODO +1?</span>
+</span><span id="__span-0-893"><a id="__codelineno-0-893" name="__codelineno-0-893"></a>                    <span class="n">chunk_start</span> <span class="o">=</span> <span class="n">chunk_end</span> <span class="o">+</span> <span class="mi">1</span>  <span class="c1"># set start for the next chunk</span>
+</span><span id="__span-0-894"><a id="__codelineno-0-894" name="__codelineno-0-894"></a>
+</span><span id="__span-0-895"><a id="__codelineno-0-895" name="__codelineno-0-895"></a>                <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-896"><a id="__codelineno-0-896" name="__codelineno-0-896"></a>                    <span class="c1"># break file loop</span>
+</span><span id="__span-0-897"><a id="__codelineno-0-897" name="__codelineno-0-897"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;break file loop&quot;</span><span class="p">)</span>
+</span><span id="__span-0-898"><a id="__codelineno-0-898" name="__codelineno-0-898"></a>                    <span class="k">break</span>
+</span><span id="__span-0-899"><a id="__codelineno-0-899" name="__codelineno-0-899"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-900"><a id="__codelineno-0-900" name="__codelineno-0-900"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;Cannot generate texts because output files do not exist.&quot;</span><span class="p">)</span>
+</span><span id="__span-0-901"><a id="__codelineno-0-901" name="__codelineno-0-901"></a>
+</span><span id="__span-0-902"><a id="__codelineno-0-902" name="__codelineno-0-902"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
+</span><span id="__span-0-903"><a id="__codelineno-0-903" name="__codelineno-0-903"></a>            <span class="s2">&quot;Texts generated: </span><span class="si">%s</span><span class="s2"> (expected size: </span><span class="si">%s</span><span class="s2">; offset: </span><span class="si">%s</span><span class="s2">; limit: </span><span class="si">%s</span><span class="s2">;)&quot;</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="n">limit</span> <span class="o">-</span> <span class="n">offset</span><span class="p">,</span> <span class="n">offset</span><span class="p">,</span> <span class="n">limit</span>
+</span><span id="__span-0-904"><a id="__codelineno-0-904" name="__codelineno-0-904"></a>        <span class="p">)</span>
+</span><span id="__span-0-905"><a id="__codelineno-0-905" name="__codelineno-0-905"></a>
+</span><span id="__span-0-906"><a id="__codelineno-0-906" name="__codelineno-0-906"></a>    <span class="k">def</span> <span class="nf">get_estimated_bytes_from_output</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">read_first_n_rows</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1_000</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
+</span><span id="__span-0-907"><a id="__codelineno-0-907" name="__codelineno-0-907"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Estimate byte size of output text:</span>
+</span><span id="__span-0-908"><a id="__codelineno-0-908" name="__codelineno-0-908"></a><span class="sd">        - read the first N rows of each shuffled output file and sum the UTF-8 byte size of their first column</span>
+</span><span id="__span-0-909"><a id="__codelineno-0-909" name="__codelineno-0-909"></a><span class="sd">        - divide the byte sum by N and multiply it by the total number of rows (read from the Parquet metadata)</span>
+</span><span id="__span-0-910"><a id="__codelineno-0-910" name="__codelineno-0-910"></a><span class="sd">        &quot;&quot;&quot;</span>
+</span><span id="__span-0-911"><a id="__codelineno-0-911" name="__codelineno-0-911"></a>        <span class="k">if</span> <span class="ow">not</span> <span class="n">shuffled</span><span class="p">:</span>
+</span><span id="__span-0-912"><a id="__codelineno-0-912" name="__codelineno-0-912"></a>            <span class="k">raise</span> <span class="ne">NotImplementedError</span>
+</span><span id="__span-0-913"><a id="__codelineno-0-913" name="__codelineno-0-913"></a>
+</span><span id="__span-0-914"><a id="__codelineno-0-914" name="__codelineno-0-914"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">!=</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-915"><a id="__codelineno-0-915" name="__codelineno-0-915"></a>            <span class="k">raise</span> <span class="ne">NotImplementedError</span>
+</span><span id="__span-0-916"><a id="__codelineno-0-916" name="__codelineno-0-916"></a>
+</span><span id="__span-0-917"><a id="__codelineno-0-917" name="__codelineno-0-917"></a>        <span class="n">bytes_sum</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-918"><a id="__codelineno-0-918" name="__codelineno-0-918"></a>        <span class="n">total_rows</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-919"><a id="__codelineno-0-919" name="__codelineno-0-919"></a>
+</span><span id="__span-0-920"><a id="__codelineno-0-920" name="__codelineno-0-920"></a>        <span class="c1"># iterate over output files (use shuffled files for a better estimate)</span>
+</span><span id="__span-0-921"><a id="__codelineno-0-921" name="__codelineno-0-921"></a>        <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">):</span>
+</span><span id="__span-0-922"><a id="__codelineno-0-922" name="__codelineno-0-922"></a>            <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_path</span><span class="p">):</span>
+</span><span id="__span-0-923"><a id="__codelineno-0-923" name="__codelineno-0-923"></a>                <span class="c1"># read the first n rows</span>
+</span><span id="__span-0-924"><a id="__codelineno-0-924" name="__codelineno-0-924"></a>                <span class="n">df</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">scan_parquet</span><span class="p">(</span>
+</span><span id="__span-0-925"><a id="__codelineno-0-925" name="__codelineno-0-925"></a>                    <span class="n">output_path</span><span class="p">,</span>
+</span><span id="__span-0-926"><a id="__codelineno-0-926" name="__codelineno-0-926"></a>                    <span class="n">low_memory</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+</span><span id="__span-0-927"><a id="__codelineno-0-927" name="__codelineno-0-927"></a>                    <span class="n">n_rows</span><span class="o">=</span><span class="n">read_first_n_rows</span><span class="p">,</span>
+</span><span id="__span-0-928"><a id="__codelineno-0-928" name="__codelineno-0-928"></a>                <span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">(</span><span class="n">streaming</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+</span><span id="__span-0-929"><a id="__codelineno-0-929" name="__codelineno-0-929"></a>                <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">iter_rows</span><span class="p">():</span>
+</span><span id="__span-0-930"><a id="__codelineno-0-930" name="__codelineno-0-930"></a>                    <span class="n">text</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
+</span><span id="__span-0-931"><a id="__codelineno-0-931" name="__codelineno-0-931"></a>                    <span class="n">bytes_sum</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">text</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">))</span>  <span class="c1"># count the byte size of the text</span>
+</span><span id="__span-0-932"><a id="__codelineno-0-932" name="__codelineno-0-932"></a>
+</span><span id="__span-0-933"><a id="__codelineno-0-933" name="__codelineno-0-933"></a>                <span class="c1"># read total row count from metadata</span>
+</span><span id="__span-0-934"><a id="__codelineno-0-934" name="__codelineno-0-934"></a>                <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">output_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-935"><a id="__codelineno-0-935" name="__codelineno-0-935"></a>                    <span class="n">parquet_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-936"><a id="__codelineno-0-936" name="__codelineno-0-936"></a>                        <span class="n">f</span><span class="p">,</span>
+</span><span id="__span-0-937"><a id="__codelineno-0-937" name="__codelineno-0-937"></a>                        <span class="c1"># increased to avoid OSErrors</span>
+</span><span id="__span-0-938"><a id="__codelineno-0-938" name="__codelineno-0-938"></a>                        <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="mi">1000000000</span><span class="p">,</span>  <span class="c1"># default: 100000000</span>
+</span><span id="__span-0-939"><a id="__codelineno-0-939" name="__codelineno-0-939"></a>                        <span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="mi">10000000</span><span class="p">,</span>  <span class="c1"># default: 1000000</span>
+</span><span id="__span-0-940"><a id="__codelineno-0-940" name="__codelineno-0-940"></a>                    <span class="p">)</span>
+</span><span id="__span-0-941"><a id="__codelineno-0-941" name="__codelineno-0-941"></a>                    <span class="n">total_rows</span> <span class="o">+=</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_rows</span>
+</span><span id="__span-0-942"><a id="__codelineno-0-942" name="__codelineno-0-942"></a>
+</span><span id="__span-0-943"><a id="__codelineno-0-943" name="__codelineno-0-943"></a>        <span class="c1"># estimated bytes</span>
+</span><span id="__span-0-944"><a id="__codelineno-0-944" name="__codelineno-0-944"></a>        <span class="n">bytes_per_row</span> <span class="o">=</span> <span class="n">bytes_sum</span> <span class="o">/</span> <span class="n">read_first_n_rows</span>
+</span><span id="__span-0-945"><a id="__codelineno-0-945" name="__codelineno-0-945"></a>        <span class="n">total_bytes</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">total_rows</span> <span class="o">*</span> <span class="n">bytes_per_row</span><span class="p">)</span>
+</span><span id="__span-0-946"><a id="__codelineno-0-946" name="__codelineno-0-946"></a>
+</span><span id="__span-0-947"><a id="__codelineno-0-947" name="__codelineno-0-947"></a>        <span class="k">return</span> <span class="n">total_bytes</span>
+</span><span id="__span-0-948"><a id="__codelineno-0-948" name="__codelineno-0-948"></a>
+</span><span id="__span-0-949"><a id="__codelineno-0-949" name="__codelineno-0-949"></a>    <span class="k">def</span> <span class="nf">get_sampling_factor</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
+</span><span id="__span-0-950"><a id="__codelineno-0-950" name="__codelineno-0-950"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Sampling is defined based on dataset ID, source ID, or language.&quot;&quot;&quot;</span>
+</span><span id="__span-0-951"><a id="__codelineno-0-951" name="__codelineno-0-951"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">:</span>
+</span><span id="__span-0-952"><a id="__codelineno-0-952" name="__codelineno-0-952"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_dataset_id</span><span class="p">:</span>
+</span><span id="__span-0-953"><a id="__codelineno-0-953" name="__codelineno-0-953"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_dataset_id</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">]</span>
+</span><span id="__span-0-954"><a id="__codelineno-0-954" name="__codelineno-0-954"></a>
+</span><span id="__span-0-955"><a id="__codelineno-0-955" name="__codelineno-0-955"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_source_id</span><span class="p">:</span>
+</span><span id="__span-0-956"><a id="__codelineno-0-956" name="__codelineno-0-956"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_source_id</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()]</span>
+</span><span id="__span-0-957"><a id="__codelineno-0-957" name="__codelineno-0-957"></a>
+</span><span id="__span-0-958"><a id="__codelineno-0-958" name="__codelineno-0-958"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_language_code</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_language</span><span class="p">:</span>
+</span><span id="__span-0-959"><a id="__codelineno-0-959" name="__codelineno-0-959"></a>                <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_language</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_language_code</span><span class="p">()]</span>
+</span><span id="__span-0-960"><a id="__codelineno-0-960" name="__codelineno-0-960"></a>
+</span><span id="__span-0-961"><a id="__codelineno-0-961" name="__codelineno-0-961"></a>        <span class="k">return</span> <span class="mf">1.0</span>  <span class="c1"># default factor</span>
+</span><span id="__span-0-962"><a id="__codelineno-0-962" name="__codelineno-0-962"></a>
+</span><span id="__span-0-963"><a id="__codelineno-0-963" name="__codelineno-0-963"></a>    <span class="k">def</span> <span class="nf">is_selected</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
+</span><span id="__span-0-964"><a id="__codelineno-0-964" name="__codelineno-0-964"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Is this dataset part of selected datasets or sources?&quot;&quot;&quot;</span>
+</span><span id="__span-0-965"><a id="__codelineno-0-965" name="__codelineno-0-965"></a>        <span class="k">if</span> <span class="p">(</span>
+</span><span id="__span-0-966"><a id="__codelineno-0-966" name="__codelineno-0-966"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">selected_dataset_ids</span>
+</span><span id="__span-0-967"><a id="__codelineno-0-967" name="__codelineno-0-967"></a>            <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">selected_source_ids</span>
+</span><span id="__span-0-968"><a id="__codelineno-0-968" name="__codelineno-0-968"></a>        <span class="p">):</span>
+</span><span id="__span-0-969"><a id="__codelineno-0-969" name="__codelineno-0-969"></a>            <span class="k">return</span> <span class="kc">True</span>
+</span><span id="__span-0-970"><a id="__codelineno-0-970" name="__codelineno-0-970"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-971"><a id="__codelineno-0-971" name="__codelineno-0-971"></a>            <span class="c1"># try fnmatch</span>
+</span><span id="__span-0-972"><a id="__codelineno-0-972" name="__codelineno-0-972"></a>            <span class="k">for</span> <span class="n">pattern</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">get_selected_dataset_ids</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;fnmatch&quot;</span><span class="p">):</span>
+</span><span id="__span-0-973"><a id="__codelineno-0-973" name="__codelineno-0-973"></a>                <span class="k">if</span> <span class="n">fnmatch</span><span class="o">.</span><span class="n">fnmatch</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">,</span> <span class="n">pattern</span><span class="p">):</span>
+</span><span id="__span-0-974"><a id="__codelineno-0-974" name="__codelineno-0-974"></a>                    <span class="k">return</span> <span class="kc">True</span>
+</span><span id="__span-0-975"><a id="__codelineno-0-975" name="__codelineno-0-975"></a>
+</span><span id="__span-0-976"><a id="__codelineno-0-976" name="__codelineno-0-976"></a>            <span class="k">return</span> <span class="kc">False</span>
+</span><span id="__span-0-977"><a id="__codelineno-0-977" name="__codelineno-0-977"></a>
+</span><span id="__span-0-978"><a id="__codelineno-0-978" name="__codelineno-0-978"></a>    <span class="k">def</span> <span class="nf">get_shuffled_output_file_path</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">unshuffled_output_file_path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-979"><a id="__codelineno-0-979" name="__codelineno-0-979"></a>        <span class="n">output_file_name</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="n">unshuffled_output_file_path</span><span class="p">)</span><span class="o">.</span><span class="n">name</span>
+</span><span id="__span-0-980"><a id="__codelineno-0-980" name="__codelineno-0-980"></a>
+</span><span id="__span-0-981"><a id="__codelineno-0-981" name="__codelineno-0-981"></a>        <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span>
+</span><span id="__span-0-982"><a id="__codelineno-0-982" name="__codelineno-0-982"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">shuffled_datasets_dir</span><span class="p">,</span> <span class="n">output_file_name</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;.parquet&quot;</span><span class="p">,</span> <span class="s2">&quot;.shuffled.parquet&quot;</span><span class="p">)</span>
+</span><span id="__span-0-983"><a id="__codelineno-0-983" name="__codelineno-0-983"></a>        <span class="p">)</span>
+</span><span id="__span-0-984"><a id="__codelineno-0-984" name="__codelineno-0-984"></a>
+</span><span id="__span-0-985"><a id="__codelineno-0-985" name="__codelineno-0-985"></a>    <span class="k">def</span> <span class="nf">save_stats</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-986"><a id="__codelineno-0-986" name="__codelineno-0-986"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Save the processing statistics (counter) into a JSON file in the output directory.&quot;&quot;&quot;</span>
+</span><span id="__span-0-987"><a id="__codelineno-0-987" name="__codelineno-0-987"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">counter</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-988"><a id="__codelineno-0-988" name="__codelineno-0-988"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="s2">&quot;Cannot save statistics because none were recorded.&quot;</span><span class="p">)</span>
+</span><span id="__span-0-989"><a id="__codelineno-0-989" name="__codelineno-0-989"></a>            <span class="k">return</span>
+</span><span id="__span-0-990"><a id="__codelineno-0-990" name="__codelineno-0-990"></a>
+</span><span id="__span-0-991"><a id="__codelineno-0-991" name="__codelineno-0-991"></a>        <span class="n">date_format</span> <span class="o">=</span> <span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2">_%H%M%S&quot;</span>
+</span><span id="__span-0-992"><a id="__codelineno-0-992" name="__codelineno-0-992"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">end_time</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+</span><span id="__span-0-993"><a id="__codelineno-0-993" name="__codelineno-0-993"></a>        <span class="n">short_uuid</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">uuid</span><span class="o">.</span><span class="n">uuid4</span><span class="p">())[:</span><span class="mi">5</span><span class="p">]</span>
+</span><span id="__span-0-994"><a id="__codelineno-0-994" name="__codelineno-0-994"></a>        <span class="n">stats_file_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;stats_</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">end_time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">date_format</span><span class="p">)</span><span class="si">}</span><span class="s2">_</span><span class="si">{</span><span class="n">short_uuid</span><span class="si">}</span><span class="s2">.</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">get_job_id</span><span class="p">()</span><span class="si">}</span><span class="s2">.json&quot;</span>
+</span><span id="__span-0-995"><a id="__codelineno-0-995" name="__codelineno-0-995"></a>        <span class="n">stats_file_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_dir</span><span class="p">(),</span> <span class="n">stats_file_name</span><span class="p">)</span>
+</span><span id="__span-0-996"><a id="__codelineno-0-996" name="__codelineno-0-996"></a>
+</span><span id="__span-0-997"><a id="__codelineno-0-997" name="__codelineno-0-997"></a>        <span class="n">stats</span> <span class="o">=</span> <span class="p">{</span>
+</span><span id="__span-0-998"><a id="__codelineno-0-998" name="__codelineno-0-998"></a>            <span class="s2">&quot;counter&quot;</span><span class="p">:</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="p">),</span>
+</span><span id="__span-0-999"><a id="__codelineno-0-999" name="__codelineno-0-999"></a>            <span class="s2">&quot;start_time&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">start_time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">date_format</span><span class="p">),</span>
+</span><span id="__span-0-1000"><a id="__codelineno-0-1000" name="__codelineno-0-1000"></a>            <span class="s2">&quot;end_time&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">date_format</span><span class="p">),</span>
+</span><span id="__span-0-1001"><a id="__codelineno-0-1001" name="__codelineno-0-1001"></a>            <span class="s2">&quot;job_id&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">get_job_id</span><span class="p">(),</span>
+</span><span id="__span-0-1002"><a id="__codelineno-0-1002" name="__codelineno-0-1002"></a>            <span class="c1"># &quot;config&quot;: self.config,</span>
+</span><span id="__span-0-1003"><a id="__codelineno-0-1003" name="__codelineno-0-1003"></a>        <span class="p">}</span>
+</span><span id="__span-0-1004"><a id="__codelineno-0-1004" name="__codelineno-0-1004"></a>
+</span><span id="__span-0-1005"><a id="__codelineno-0-1005" name="__codelineno-0-1005"></a>        <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">stats_file_path</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-1006"><a id="__codelineno-0-1006" name="__codelineno-0-1006"></a>            <span class="n">json</span><span class="o">.</span><span class="n">dump</span><span class="p">(</span><span class="n">stats</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
+</span><span id="__span-0-1007"><a id="__codelineno-0-1007" name="__codelineno-0-1007"></a>
+</span><span id="__span-0-1008"><a id="__codelineno-0-1008" name="__codelineno-0-1008"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Statistics saved to </span><span class="si">{</span><span class="n">stats_file_path</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-1009"><a id="__codelineno-0-1009" name="__codelineno-0-1009"></a>
+</span><span id="__span-0-1010"><a id="__codelineno-0-1010" name="__codelineno-0-1010"></a>        <span class="k">return</span> <span class="n">stats_file_path</span>
+</span></code></pre></div></td></tr></table></div>
+              </details>
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.filter_documents" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">filter_documents</span><span class="p">(</span><span class="n">documents</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
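+      <p>Applies basic filtering on the documents before saving: when <code>min_length</code> is greater than zero, documents whose text is shorter than <code>min_length</code> are skipped.</p>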
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-339">339</a></span>
+<span class="normal"><a href="#__codelineno-0-340">340</a></span>
+<span class="normal"><a href="#__codelineno-0-341">341</a></span>
+<span class="normal"><a href="#__codelineno-0-342">342</a></span>
+<span class="normal"><a href="#__codelineno-0-343">343</a></span>
+<span class="normal"><a href="#__codelineno-0-344">344</a></span>
+<span class="normal"><a href="#__codelineno-0-345">345</a></span>
+<span class="normal"><a href="#__codelineno-0-346">346</a></span>
+<span class="normal"><a href="#__codelineno-0-347">347</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-339"><a id="__codelineno-0-339" name="__codelineno-0-339"></a><span class="k">def</span> <span class="nf">filter_documents</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">documents</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Document</span><span class="p">]):</span>
+</span><span id="__span-0-340"><a id="__codelineno-0-340" name="__codelineno-0-340"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Applies basic filtering on the documents before saving.&quot;&quot;&quot;</span>
+</span><span id="__span-0-341"><a id="__codelineno-0-341" name="__codelineno-0-341"></a>    <span class="k">for</span> <span class="n">doc</span> <span class="ow">in</span> <span class="n">documents</span><span class="p">:</span>
+</span><span id="__span-0-342"><a id="__codelineno-0-342" name="__codelineno-0-342"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">doc</span><span class="o">.</span><span class="n">text</span><span class="p">)</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span><span class="p">:</span>
+</span><span id="__span-0-343"><a id="__codelineno-0-343" name="__codelineno-0-343"></a>            <span class="c1"># skip because of short text length</span>
+</span><span id="__span-0-344"><a id="__codelineno-0-344" name="__codelineno-0-344"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;filtered_short_text&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">})</span>
+</span><span id="__span-0-345"><a id="__codelineno-0-345" name="__codelineno-0-345"></a>            <span class="k">continue</span>
+</span><span id="__span-0-346"><a id="__codelineno-0-346" name="__codelineno-0-346"></a>
+</span><span id="__span-0-347"><a id="__codelineno-0-347" name="__codelineno-0-347"></a>        <span class="k">yield</span> <span class="n">doc</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.filter_texts" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">filter_texts</span><span class="p">(</span><span class="n">texts</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Applies basic filtering on the texts before saving</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-349">349</a></span>
+<span class="normal"><a href="#__codelineno-0-350">350</a></span>
+<span class="normal"><a href="#__codelineno-0-351">351</a></span>
+<span class="normal"><a href="#__codelineno-0-352">352</a></span>
+<span class="normal"><a href="#__codelineno-0-353">353</a></span>
+<span class="normal"><a href="#__codelineno-0-354">354</a></span>
+<span class="normal"><a href="#__codelineno-0-355">355</a></span>
+<span class="normal"><a href="#__codelineno-0-356">356</a></span>
+<span class="normal"><a href="#__codelineno-0-357">357</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-349"><a id="__codelineno-0-349" name="__codelineno-0-349"></a><span class="k">def</span> <span class="nf">filter_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">]):</span>
+</span><span id="__span-0-350"><a id="__codelineno-0-350" name="__codelineno-0-350"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Applies basic filtering on the texts before saving&quot;&quot;&quot;</span>
+</span><span id="__span-0-351"><a id="__codelineno-0-351" name="__codelineno-0-351"></a>    <span class="k">for</span> <span class="n">text</span> <span class="ow">in</span> <span class="n">texts</span><span class="p">:</span>
+</span><span id="__span-0-352"><a id="__codelineno-0-352" name="__codelineno-0-352"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">text</span><span class="p">)</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_length</span><span class="p">:</span>
+</span><span id="__span-0-353"><a id="__codelineno-0-353" name="__codelineno-0-353"></a>            <span class="c1"># skip because of short text length</span>
+</span><span id="__span-0-354"><a id="__codelineno-0-354" name="__codelineno-0-354"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;filtered_short_text&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">})</span>
+</span><span id="__span-0-355"><a id="__codelineno-0-355" name="__codelineno-0-355"></a>            <span class="k">continue</span>
+</span><span id="__span-0-356"><a id="__codelineno-0-356" name="__codelineno-0-356"></a>
+</span><span id="__span-0-357"><a id="__codelineno-0-357" name="__codelineno-0-357"></a>        <span class="k">yield</span> <span class="n">text</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.generate_texts_from_output" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">generate_texts_from_output</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">batch_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">offset</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">shuffle_output_file_paths</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">reader_implementation</span><span class="o">=</span><span class="s1">&#39;pyarrow&#39;</span><span class="p">,</span> <span class="n">cast_to_py_string</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>An iterator over texts from processed output files.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-698">698</a></span>
+<span class="normal"><a href="#__codelineno-0-699">699</a></span>
+<span class="normal"><a href="#__codelineno-0-700">700</a></span>
+<span class="normal"><a href="#__codelineno-0-701">701</a></span>
+<span class="normal"><a href="#__codelineno-0-702">702</a></span>
+<span class="normal"><a href="#__codelineno-0-703">703</a></span>
+<span class="normal"><a href="#__codelineno-0-704">704</a></span>
+<span class="normal"><a href="#__codelineno-0-705">705</a></span>
+<span class="normal"><a href="#__codelineno-0-706">706</a></span>
+<span class="normal"><a href="#__codelineno-0-707">707</a></span>
+<span class="normal"><a href="#__codelineno-0-708">708</a></span>
+<span class="normal"><a href="#__codelineno-0-709">709</a></span>
+<span class="normal"><a href="#__codelineno-0-710">710</a></span>
+<span class="normal"><a href="#__codelineno-0-711">711</a></span>
+<span class="normal"><a href="#__codelineno-0-712">712</a></span>
+<span class="normal"><a href="#__codelineno-0-713">713</a></span>
+<span class="normal"><a href="#__codelineno-0-714">714</a></span>
+<span class="normal"><a href="#__codelineno-0-715">715</a></span>
+<span class="normal"><a href="#__codelineno-0-716">716</a></span>
+<span class="normal"><a href="#__codelineno-0-717">717</a></span>
+<span class="normal"><a href="#__codelineno-0-718">718</a></span>
+<span class="normal"><a href="#__codelineno-0-719">719</a></span>
+<span class="normal"><a href="#__codelineno-0-720">720</a></span>
+<span class="normal"><a href="#__codelineno-0-721">721</a></span>
+<span class="normal"><a href="#__codelineno-0-722">722</a></span>
+<span class="normal"><a href="#__codelineno-0-723">723</a></span>
+<span class="normal"><a href="#__codelineno-0-724">724</a></span>
+<span class="normal"><a href="#__codelineno-0-725">725</a></span>
+<span class="normal"><a href="#__codelineno-0-726">726</a></span>
+<span class="normal"><a href="#__codelineno-0-727">727</a></span>
+<span class="normal"><a href="#__codelineno-0-728">728</a></span>
+<span class="normal"><a href="#__codelineno-0-729">729</a></span>
+<span class="normal"><a href="#__codelineno-0-730">730</a></span>
+<span class="normal"><a href="#__codelineno-0-731">731</a></span>
+<span class="normal"><a href="#__codelineno-0-732">732</a></span>
+<span class="normal"><a href="#__codelineno-0-733">733</a></span>
+<span class="normal"><a href="#__codelineno-0-734">734</a></span>
+<span class="normal"><a href="#__codelineno-0-735">735</a></span>
+<span class="normal"><a href="#__codelineno-0-736">736</a></span>
+<span class="normal"><a href="#__codelineno-0-737">737</a></span>
+<span class="normal"><a href="#__codelineno-0-738">738</a></span>
+<span class="normal"><a href="#__codelineno-0-739">739</a></span>
+<span class="normal"><a href="#__codelineno-0-740">740</a></span>
+<span class="normal"><a href="#__codelineno-0-741">741</a></span>
+<span class="normal"><a href="#__codelineno-0-742">742</a></span>
+<span class="normal"><a href="#__codelineno-0-743">743</a></span>
+<span class="normal"><a href="#__codelineno-0-744">744</a></span>
+<span class="normal"><a href="#__codelineno-0-745">745</a></span>
+<span class="normal"><a href="#__codelineno-0-746">746</a></span>
+<span class="normal"><a href="#__codelineno-0-747">747</a></span>
+<span class="normal"><a href="#__codelineno-0-748">748</a></span>
+<span class="normal"><a href="#__codelineno-0-749">749</a></span>
+<span class="normal"><a href="#__codelineno-0-750">750</a></span>
+<span class="normal"><a href="#__codelineno-0-751">751</a></span>
+<span class="normal"><a href="#__codelineno-0-752">752</a></span>
+<span class="normal"><a href="#__codelineno-0-753">753</a></span>
+<span class="normal"><a href="#__codelineno-0-754">754</a></span>
+<span class="normal"><a href="#__codelineno-0-755">755</a></span>
+<span class="normal"><a href="#__codelineno-0-756">756</a></span>
+<span class="normal"><a href="#__codelineno-0-757">757</a></span>
+<span class="normal"><a href="#__codelineno-0-758">758</a></span>
+<span class="normal"><a href="#__codelineno-0-759">759</a></span>
+<span class="normal"><a href="#__codelineno-0-760">760</a></span>
+<span class="normal"><a href="#__codelineno-0-761">761</a></span>
+<span class="normal"><a href="#__codelineno-0-762">762</a></span>
+<span class="normal"><a href="#__codelineno-0-763">763</a></span>
+<span class="normal"><a href="#__codelineno-0-764">764</a></span>
+<span class="normal"><a href="#__codelineno-0-765">765</a></span>
+<span class="normal"><a href="#__codelineno-0-766">766</a></span>
+<span class="normal"><a href="#__codelineno-0-767">767</a></span>
+<span class="normal"><a href="#__codelineno-0-768">768</a></span>
+<span class="normal"><a href="#__codelineno-0-769">769</a></span>
+<span class="normal"><a href="#__codelineno-0-770">770</a></span>
+<span class="normal"><a href="#__codelineno-0-771">771</a></span>
+<span class="normal"><a href="#__codelineno-0-772">772</a></span>
+<span class="normal"><a href="#__codelineno-0-773">773</a></span>
+<span class="normal"><a href="#__codelineno-0-774">774</a></span>
+<span class="normal"><a href="#__codelineno-0-775">775</a></span>
+<span class="normal"><a href="#__codelineno-0-776">776</a></span>
+<span class="normal"><a href="#__codelineno-0-777">777</a></span>
+<span class="normal"><a href="#__codelineno-0-778">778</a></span>
+<span class="normal"><a href="#__codelineno-0-779">779</a></span>
+<span class="normal"><a href="#__codelineno-0-780">780</a></span>
+<span class="normal"><a href="#__codelineno-0-781">781</a></span>
+<span class="normal"><a href="#__codelineno-0-782">782</a></span>
+<span class="normal"><a href="#__codelineno-0-783">783</a></span>
+<span class="normal"><a href="#__codelineno-0-784">784</a></span>
+<span class="normal"><a href="#__codelineno-0-785">785</a></span>
+<span class="normal"><a href="#__codelineno-0-786">786</a></span>
+<span class="normal"><a href="#__codelineno-0-787">787</a></span>
+<span class="normal"><a href="#__codelineno-0-788">788</a></span>
+<span class="normal"><a href="#__codelineno-0-789">789</a></span>
+<span class="normal"><a href="#__codelineno-0-790">790</a></span>
+<span class="normal"><a href="#__codelineno-0-791">791</a></span>
+<span class="normal"><a href="#__codelineno-0-792">792</a></span>
+<span class="normal"><a href="#__codelineno-0-793">793</a></span>
+<span class="normal"><a href="#__codelineno-0-794">794</a></span>
+<span class="normal"><a href="#__codelineno-0-795">795</a></span>
+<span class="normal"><a href="#__codelineno-0-796">796</a></span>
+<span class="normal"><a href="#__codelineno-0-797">797</a></span>
+<span class="normal"><a href="#__codelineno-0-798">798</a></span>
+<span class="normal"><a href="#__codelineno-0-799">799</a></span>
+<span class="normal"><a href="#__codelineno-0-800">800</a></span>
+<span class="normal"><a href="#__codelineno-0-801">801</a></span>
+<span class="normal"><a href="#__codelineno-0-802">802</a></span>
+<span class="normal"><a href="#__codelineno-0-803">803</a></span>
+<span class="normal"><a href="#__codelineno-0-804">804</a></span>
+<span class="normal"><a href="#__codelineno-0-805">805</a></span>
+<span class="normal"><a href="#__codelineno-0-806">806</a></span>
+<span class="normal"><a href="#__codelineno-0-807">807</a></span>
+<span class="normal"><a href="#__codelineno-0-808">808</a></span>
+<span class="normal"><a href="#__codelineno-0-809">809</a></span>
+<span class="normal"><a href="#__codelineno-0-810">810</a></span>
+<span class="normal"><a href="#__codelineno-0-811">811</a></span>
+<span class="normal"><a href="#__codelineno-0-812">812</a></span>
+<span class="normal"><a href="#__codelineno-0-813">813</a></span>
+<span class="normal"><a href="#__codelineno-0-814">814</a></span>
+<span class="normal"><a href="#__codelineno-0-815">815</a></span>
+<span class="normal"><a href="#__codelineno-0-816">816</a></span>
+<span class="normal"><a href="#__codelineno-0-817">817</a></span>
+<span class="normal"><a href="#__codelineno-0-818">818</a></span>
+<span class="normal"><a href="#__codelineno-0-819">819</a></span>
+<span class="normal"><a href="#__codelineno-0-820">820</a></span>
+<span class="normal"><a href="#__codelineno-0-821">821</a></span>
+<span class="normal"><a href="#__codelineno-0-822">822</a></span>
+<span class="normal"><a href="#__codelineno-0-823">823</a></span>
+<span class="normal"><a href="#__codelineno-0-824">824</a></span>
+<span class="normal"><a href="#__codelineno-0-825">825</a></span>
+<span class="normal"><a href="#__codelineno-0-826">826</a></span>
+<span class="normal"><a href="#__codelineno-0-827">827</a></span>
+<span class="normal"><a href="#__codelineno-0-828">828</a></span>
+<span class="normal"><a href="#__codelineno-0-829">829</a></span>
+<span class="normal"><a href="#__codelineno-0-830">830</a></span>
+<span class="normal"><a href="#__codelineno-0-831">831</a></span>
+<span class="normal"><a href="#__codelineno-0-832">832</a></span>
+<span class="normal"><a href="#__codelineno-0-833">833</a></span>
+<span class="normal"><a href="#__codelineno-0-834">834</a></span>
+<span class="normal"><a href="#__codelineno-0-835">835</a></span>
+<span class="normal"><a href="#__codelineno-0-836">836</a></span>
+<span class="normal"><a href="#__codelineno-0-837">837</a></span>
+<span class="normal"><a href="#__codelineno-0-838">838</a></span>
+<span class="normal"><a href="#__codelineno-0-839">839</a></span>
+<span class="normal"><a href="#__codelineno-0-840">840</a></span>
+<span class="normal"><a href="#__codelineno-0-841">841</a></span>
+<span class="normal"><a href="#__codelineno-0-842">842</a></span>
+<span class="normal"><a href="#__codelineno-0-843">843</a></span>
+<span class="normal"><a href="#__codelineno-0-844">844</a></span>
+<span class="normal"><a href="#__codelineno-0-845">845</a></span>
+<span class="normal"><a href="#__codelineno-0-846">846</a></span>
+<span class="normal"><a href="#__codelineno-0-847">847</a></span>
+<span class="normal"><a href="#__codelineno-0-848">848</a></span>
+<span class="normal"><a href="#__codelineno-0-849">849</a></span>
+<span class="normal"><a href="#__codelineno-0-850">850</a></span>
+<span class="normal"><a href="#__codelineno-0-851">851</a></span>
+<span class="normal"><a href="#__codelineno-0-852">852</a></span>
+<span class="normal"><a href="#__codelineno-0-853">853</a></span>
+<span class="normal"><a href="#__codelineno-0-854">854</a></span>
+<span class="normal"><a href="#__codelineno-0-855">855</a></span>
+<span class="normal"><a href="#__codelineno-0-856">856</a></span>
+<span class="normal"><a href="#__codelineno-0-857">857</a></span>
+<span class="normal"><a href="#__codelineno-0-858">858</a></span>
+<span class="normal"><a href="#__codelineno-0-859">859</a></span>
+<span class="normal"><a href="#__codelineno-0-860">860</a></span>
+<span class="normal"><a href="#__codelineno-0-861">861</a></span>
+<span class="normal"><a href="#__codelineno-0-862">862</a></span>
+<span class="normal"><a href="#__codelineno-0-863">863</a></span>
+<span class="normal"><a href="#__codelineno-0-864">864</a></span>
+<span class="normal"><a href="#__codelineno-0-865">865</a></span>
+<span class="normal"><a href="#__codelineno-0-866">866</a></span>
+<span class="normal"><a href="#__codelineno-0-867">867</a></span>
+<span class="normal"><a href="#__codelineno-0-868">868</a></span>
+<span class="normal"><a href="#__codelineno-0-869">869</a></span>
+<span class="normal"><a href="#__codelineno-0-870">870</a></span>
+<span class="normal"><a href="#__codelineno-0-871">871</a></span>
+<span class="normal"><a href="#__codelineno-0-872">872</a></span>
+<span class="normal"><a href="#__codelineno-0-873">873</a></span>
+<span class="normal"><a href="#__codelineno-0-874">874</a></span>
+<span class="normal"><a href="#__codelineno-0-875">875</a></span>
+<span class="normal"><a href="#__codelineno-0-876">876</a></span>
+<span class="normal"><a href="#__codelineno-0-877">877</a></span>
+<span class="normal"><a href="#__codelineno-0-878">878</a></span>
+<span class="normal"><a href="#__codelineno-0-879">879</a></span>
+<span class="normal"><a href="#__codelineno-0-880">880</a></span>
+<span class="normal"><a href="#__codelineno-0-881">881</a></span>
+<span class="normal"><a href="#__codelineno-0-882">882</a></span>
+<span class="normal"><a href="#__codelineno-0-883">883</a></span>
+<span class="normal"><a href="#__codelineno-0-884">884</a></span>
+<span class="normal"><a href="#__codelineno-0-885">885</a></span>
+<span class="normal"><a href="#__codelineno-0-886">886</a></span>
+<span class="normal"><a href="#__codelineno-0-887">887</a></span>
+<span class="normal"><a href="#__codelineno-0-888">888</a></span>
+<span class="normal"><a href="#__codelineno-0-889">889</a></span>
+<span class="normal"><a href="#__codelineno-0-890">890</a></span>
+<span class="normal"><a href="#__codelineno-0-891">891</a></span>
+<span class="normal"><a href="#__codelineno-0-892">892</a></span>
+<span class="normal"><a href="#__codelineno-0-893">893</a></span>
+<span class="normal"><a href="#__codelineno-0-894">894</a></span>
+<span class="normal"><a href="#__codelineno-0-895">895</a></span>
+<span class="normal"><a href="#__codelineno-0-896">896</a></span>
+<span class="normal"><a href="#__codelineno-0-897">897</a></span>
+<span class="normal"><a href="#__codelineno-0-898">898</a></span>
+<span class="normal"><a href="#__codelineno-0-899">899</a></span>
+<span class="normal"><a href="#__codelineno-0-900">900</a></span>
+<span class="normal"><a href="#__codelineno-0-901">901</a></span>
+<span class="normal"><a href="#__codelineno-0-902">902</a></span>
+<span class="normal"><a href="#__codelineno-0-903">903</a></span>
+<span class="normal"><a href="#__codelineno-0-904">904</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-698"><a id="__codelineno-0-698" name="__codelineno-0-698"></a><span class="k">def</span> <span class="nf">generate_texts_from_output</span><span class="p">(</span>
+</span><span id="__span-0-699"><a id="__codelineno-0-699" name="__codelineno-0-699"></a>    <span class="bp">self</span><span class="p">,</span>
+</span><span id="__span-0-700"><a id="__codelineno-0-700" name="__codelineno-0-700"></a>    <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-701"><a id="__codelineno-0-701" name="__codelineno-0-701"></a>    <span class="n">batch_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+</span><span id="__span-0-702"><a id="__codelineno-0-702" name="__codelineno-0-702"></a>    <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+</span><span id="__span-0-703"><a id="__codelineno-0-703" name="__codelineno-0-703"></a>    <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+</span><span id="__span-0-704"><a id="__codelineno-0-704" name="__codelineno-0-704"></a>    <span class="n">shuffle_output_file_paths</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-705"><a id="__codelineno-0-705" name="__codelineno-0-705"></a>    <span class="n">reader_implementation</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;polars_read_parquet&quot;</span><span class="p">,</span> <span class="s2">&quot;pyarrow&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;pyarrow&quot;</span><span class="p">,</span>
+</span><span id="__span-0-706"><a id="__codelineno-0-706" name="__codelineno-0-706"></a>    <span class="n">cast_to_py_string</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-707"><a id="__codelineno-0-707" name="__codelineno-0-707"></a><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">StringScalar</span><span class="p">]]:</span>
+</span><span id="__span-0-708"><a id="__codelineno-0-708" name="__codelineno-0-708"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;A iterator over texts from processed output files.&quot;&quot;&quot;</span>
+</span><span id="__span-0-709"><a id="__codelineno-0-709" name="__codelineno-0-709"></a>    <span class="k">if</span> <span class="n">batch_size</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-710"><a id="__codelineno-0-710" name="__codelineno-0-710"></a>        <span class="n">batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span>
+</span><span id="__span-0-711"><a id="__codelineno-0-711" name="__codelineno-0-711"></a>
+</span><span id="__span-0-712"><a id="__codelineno-0-712" name="__codelineno-0-712"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">!=</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-713"><a id="__codelineno-0-713" name="__codelineno-0-713"></a>        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cannot generate texts with </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_format</span><span class="si">=}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-714"><a id="__codelineno-0-714" name="__codelineno-0-714"></a>
+</span><span id="__span-0-715"><a id="__codelineno-0-715" name="__codelineno-0-715"></a>    <span class="c1"># Check if output files exists and sort them</span>
+</span><span id="__span-0-716"><a id="__codelineno-0-716" name="__codelineno-0-716"></a>    <span class="n">output_paths</span> <span class="o">=</span> <span class="p">[</span>
+</span><span id="__span-0-717"><a id="__codelineno-0-717" name="__codelineno-0-717"></a>        <span class="n">file_path</span>
+</span><span id="__span-0-718"><a id="__codelineno-0-718" name="__codelineno-0-718"></a>        <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">))</span>
+</span><span id="__span-0-719"><a id="__codelineno-0-719" name="__codelineno-0-719"></a>        <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-720"><a id="__codelineno-0-720" name="__codelineno-0-720"></a>    <span class="p">]</span>
+</span><span id="__span-0-721"><a id="__codelineno-0-721" name="__codelineno-0-721"></a>
+</span><span id="__span-0-722"><a id="__codelineno-0-722" name="__codelineno-0-722"></a>    <span class="c1"># Count generated rows</span>
+</span><span id="__span-0-723"><a id="__codelineno-0-723" name="__codelineno-0-723"></a>    <span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-724"><a id="__codelineno-0-724" name="__codelineno-0-724"></a>    <span class="n">rows_limit</span> <span class="o">=</span> <span class="n">limit</span> <span class="o">-</span> <span class="n">offset</span>
+</span><span id="__span-0-725"><a id="__codelineno-0-725" name="__codelineno-0-725"></a>
+</span><span id="__span-0-726"><a id="__codelineno-0-726" name="__codelineno-0-726"></a>    <span class="c1"># if limit &gt; 0:</span>
+</span><span id="__span-0-727"><a id="__codelineno-0-727" name="__codelineno-0-727"></a>    <span class="c1">#     batch_size = min(batch_size, limit)</span>
+</span><span id="__span-0-728"><a id="__codelineno-0-728" name="__codelineno-0-728"></a>
+</span><span id="__span-0-729"><a id="__codelineno-0-729" name="__codelineno-0-729"></a>    <span class="c1"># Shuffle output chunks:</span>
+</span><span id="__span-0-730"><a id="__codelineno-0-730" name="__codelineno-0-730"></a>    <span class="c1"># This changes the order in that the chunks are read ensure also shuffling on the full dataset level.</span>
+</span><span id="__span-0-731"><a id="__codelineno-0-731" name="__codelineno-0-731"></a>    <span class="k">if</span> <span class="n">shuffle_output_file_paths</span><span class="p">:</span>
+</span><span id="__span-0-732"><a id="__codelineno-0-732" name="__codelineno-0-732"></a>        <span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">seed</span><span class="p">)</span>  <span class="c1"># reset seed to avoid inference by other scripts</span>
+</span><span id="__span-0-733"><a id="__codelineno-0-733" name="__codelineno-0-733"></a>        <span class="n">random</span><span class="o">.</span><span class="n">shuffle</span><span class="p">(</span><span class="n">output_paths</span><span class="p">)</span>
+</span><span id="__span-0-734"><a id="__codelineno-0-734" name="__codelineno-0-734"></a>
+</span><span id="__span-0-735"><a id="__codelineno-0-735" name="__codelineno-0-735"></a>    <span class="n">chunk_start</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-736"><a id="__codelineno-0-736" name="__codelineno-0-736"></a>    <span class="n">chunk_end</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-737"><a id="__codelineno-0-737" name="__codelineno-0-737"></a>
+</span><span id="__span-0-738"><a id="__codelineno-0-738" name="__codelineno-0-738"></a>    <span class="k">if</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-739"><a id="__codelineno-0-739" name="__codelineno-0-739"></a>        <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-740"><a id="__codelineno-0-740" name="__codelineno-0-740"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Generating text from </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-741"><a id="__codelineno-0-741" name="__codelineno-0-741"></a>
+</span><span id="__span-0-742"><a id="__codelineno-0-742" name="__codelineno-0-742"></a>            <span class="c1"># PyArrow implementation</span>
+</span><span id="__span-0-743"><a id="__codelineno-0-743" name="__codelineno-0-743"></a>            <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">file_handler</span><span class="p">:</span>
+</span><span id="__span-0-744"><a id="__codelineno-0-744" name="__codelineno-0-744"></a>                <span class="n">pq_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-745"><a id="__codelineno-0-745" name="__codelineno-0-745"></a>                    <span class="n">file_handler</span><span class="p">,</span>
+</span><span id="__span-0-746"><a id="__codelineno-0-746" name="__codelineno-0-746"></a>                    <span class="c1"># memory_map=False,</span>
+</span><span id="__span-0-747"><a id="__codelineno-0-747" name="__codelineno-0-747"></a>                <span class="p">)</span>
+</span><span id="__span-0-748"><a id="__codelineno-0-748" name="__codelineno-0-748"></a>                <span class="n">file_rows_count</span> <span class="o">=</span> <span class="n">pq_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_rows</span>
+</span><span id="__span-0-749"><a id="__codelineno-0-749" name="__codelineno-0-749"></a>
+</span><span id="__span-0-750"><a id="__codelineno-0-750" name="__codelineno-0-750"></a>                <span class="n">chunk_end</span> <span class="o">=</span> <span class="n">chunk_start</span> <span class="o">+</span> <span class="n">file_rows_count</span> <span class="o">-</span> <span class="mi">1</span>
+</span><span id="__span-0-751"><a id="__codelineno-0-751" name="__codelineno-0-751"></a>
+</span><span id="__span-0-752"><a id="__codelineno-0-752" name="__codelineno-0-752"></a>                <span class="c1"># Should we read from the current chunk?</span>
+</span><span id="__span-0-753"><a id="__codelineno-0-753" name="__codelineno-0-753"></a>                <span class="c1"># Yes, if</span>
+</span><span id="__span-0-754"><a id="__codelineno-0-754" name="__codelineno-0-754"></a>                <span class="c1"># - offset is smaller or equal chunk_start</span>
+</span><span id="__span-0-755"><a id="__codelineno-0-755" name="__codelineno-0-755"></a>                <span class="c1"># (- limit is greater or equal chunk_end) --- limit does not matter</span>
+</span><span id="__span-0-756"><a id="__codelineno-0-756" name="__codelineno-0-756"></a>
+</span><span id="__span-0-757"><a id="__codelineno-0-757" name="__codelineno-0-757"></a>                <span class="c1"># variants</span>
+</span><span id="__span-0-758"><a id="__codelineno-0-758" name="__codelineno-0-758"></a>                <span class="c1"># A) requested rows start in chunk and ends in chunk</span>
+</span><span id="__span-0-759"><a id="__codelineno-0-759" name="__codelineno-0-759"></a>                <span class="c1"># B) requested rows start in chunk but ends in following chunk</span>
+</span><span id="__span-0-760"><a id="__codelineno-0-760" name="__codelineno-0-760"></a>                <span class="c1"># C) requested rows start before chunk and ends in chunk</span>
+</span><span id="__span-0-761"><a id="__codelineno-0-761" name="__codelineno-0-761"></a>                <span class="c1"># D) requested rows start before chunk and ends in following chunk</span>
+</span><span id="__span-0-762"><a id="__codelineno-0-762" name="__codelineno-0-762"></a>
+</span><span id="__span-0-763"><a id="__codelineno-0-763" name="__codelineno-0-763"></a>                <span class="k">if</span> <span class="p">(</span>
+</span><span id="__span-0-764"><a id="__codelineno-0-764" name="__codelineno-0-764"></a>                    <span class="n">chunk_start</span> <span class="o">&lt;=</span> <span class="n">offset</span> <span class="o">&lt;</span> <span class="n">chunk_end</span>
+</span><span id="__span-0-765"><a id="__codelineno-0-765" name="__codelineno-0-765"></a>                    <span class="ow">or</span> <span class="n">offset</span> <span class="o">&lt;</span> <span class="n">chunk_start</span>
+</span><span id="__span-0-766"><a id="__codelineno-0-766" name="__codelineno-0-766"></a>                    <span class="ow">and</span> <span class="p">(</span><span class="n">limit</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">chunk_start</span> <span class="o">&lt;</span> <span class="n">limit</span><span class="p">)</span>
+</span><span id="__span-0-767"><a id="__codelineno-0-767" name="__codelineno-0-767"></a>                <span class="p">):</span>
+</span><span id="__span-0-768"><a id="__codelineno-0-768" name="__codelineno-0-768"></a>                    <span class="n">file_offset</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span>
+</span><span id="__span-0-769"><a id="__codelineno-0-769" name="__codelineno-0-769"></a>                        <span class="mi">0</span><span class="p">,</span> <span class="n">offset</span> <span class="o">-</span> <span class="n">chunk_start</span>
+</span><span id="__span-0-770"><a id="__codelineno-0-770" name="__codelineno-0-770"></a>                    <span class="p">)</span>  <span class="c1"># global offset minus start of current file (current chunk)</span>
+</span><span id="__span-0-771"><a id="__codelineno-0-771" name="__codelineno-0-771"></a>                    <span class="n">file_limit</span> <span class="o">=</span> <span class="p">(</span>
+</span><span id="__span-0-772"><a id="__codelineno-0-772" name="__codelineno-0-772"></a>                        <span class="nb">max</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">limit</span> <span class="o">-</span> <span class="n">chunk_start</span><span class="p">)</span> <span class="k">if</span> <span class="n">limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="k">else</span> <span class="mi">0</span>  <span class="c1"># limit - chunk_start</span>
+</span><span id="__span-0-773"><a id="__codelineno-0-773" name="__codelineno-0-773"></a>                    <span class="p">)</span>  <span class="c1"># Length of the slice: global limit minus start of current chunk</span>
+</span><span id="__span-0-774"><a id="__codelineno-0-774" name="__codelineno-0-774"></a>                    <span class="c1"># TODO before: limit - chunk_start - file_offset</span>
+</span><span id="__span-0-775"><a id="__codelineno-0-775" name="__codelineno-0-775"></a>
+</span><span id="__span-0-776"><a id="__codelineno-0-776" name="__codelineno-0-776"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span>
+</span><span id="__span-0-777"><a id="__codelineno-0-777" name="__codelineno-0-777"></a>                        <span class="s2">&quot;Reading file chunk from </span><span class="si">%s</span><span class="s2">: file [</span><span class="si">%s</span><span class="s2"> - </span><span class="si">%s</span><span class="s2">]; global [</span><span class="si">%s</span><span class="s2"> - </span><span class="si">%s</span><span class="s2">]; chunk [</span><span class="si">%s</span><span class="s2"> - </span><span class="si">%s</span><span class="s2">]&quot;</span><span class="p">,</span>
+</span><span id="__span-0-778"><a id="__codelineno-0-778" name="__codelineno-0-778"></a>                        <span class="n">file_path</span><span class="p">,</span>
+</span><span id="__span-0-779"><a id="__codelineno-0-779" name="__codelineno-0-779"></a>                        <span class="n">file_offset</span><span class="p">,</span>
+</span><span id="__span-0-780"><a id="__codelineno-0-780" name="__codelineno-0-780"></a>                        <span class="n">file_limit</span><span class="p">,</span>
+</span><span id="__span-0-781"><a id="__codelineno-0-781" name="__codelineno-0-781"></a>                        <span class="n">offset</span><span class="p">,</span>
+</span><span id="__span-0-782"><a id="__codelineno-0-782" name="__codelineno-0-782"></a>                        <span class="n">limit</span><span class="p">,</span>
+</span><span id="__span-0-783"><a id="__codelineno-0-783" name="__codelineno-0-783"></a>                        <span class="n">chunk_start</span><span class="p">,</span>
+</span><span id="__span-0-784"><a id="__codelineno-0-784" name="__codelineno-0-784"></a>                        <span class="n">chunk_end</span><span class="p">,</span>
+</span><span id="__span-0-785"><a id="__codelineno-0-785" name="__codelineno-0-785"></a>                    <span class="p">)</span>
+</span><span id="__span-0-786"><a id="__codelineno-0-786" name="__codelineno-0-786"></a>                    <span class="k">if</span> <span class="n">reader_implementation</span> <span class="o">==</span> <span class="s2">&quot;pyarrow&quot;</span><span class="p">:</span>
+</span><span id="__span-0-787"><a id="__codelineno-0-787" name="__codelineno-0-787"></a>                        <span class="c1"># PyArrow implementation with iter_batches</span>
+</span><span id="__span-0-788"><a id="__codelineno-0-788" name="__codelineno-0-788"></a>                        <span class="c1"># with open(file_path, &quot;rb&quot;) as file_handler:</span>
+</span><span id="__span-0-789"><a id="__codelineno-0-789" name="__codelineno-0-789"></a>                        <span class="c1">#     parquet_file = pq.ParquetFile(file_handler)</span>
+</span><span id="__span-0-790"><a id="__codelineno-0-790" name="__codelineno-0-790"></a>
+</span><span id="__span-0-791"><a id="__codelineno-0-791" name="__codelineno-0-791"></a>                        <span class="k">for</span> <span class="n">batch_idx</span><span class="p">,</span> <span class="n">pq_batch</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span>
+</span><span id="__span-0-792"><a id="__codelineno-0-792" name="__codelineno-0-792"></a>                            <span class="n">pq_file</span><span class="o">.</span><span class="n">iter_batches</span><span class="p">(</span>
+</span><span id="__span-0-793"><a id="__codelineno-0-793" name="__codelineno-0-793"></a>                                <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">()],</span> <span class="n">batch_size</span><span class="o">=</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">False</span>
+</span><span id="__span-0-794"><a id="__codelineno-0-794" name="__codelineno-0-794"></a>                            <span class="p">)</span>
+</span><span id="__span-0-795"><a id="__codelineno-0-795" name="__codelineno-0-795"></a>                        <span class="p">):</span>
+</span><span id="__span-0-796"><a id="__codelineno-0-796" name="__codelineno-0-796"></a>                            <span class="k">for</span> <span class="n">row_idx</span><span class="p">,</span> <span class="n">text_column</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">pq_batch</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">batch_idx</span> <span class="o">*</span> <span class="n">batch_size</span><span class="p">):</span>
+</span><span id="__span-0-797"><a id="__codelineno-0-797" name="__codelineno-0-797"></a>                                <span class="k">if</span> <span class="n">row_idx</span> <span class="o">&gt;=</span> <span class="n">file_offset</span><span class="p">:</span>
+</span><span id="__span-0-798"><a id="__codelineno-0-798" name="__codelineno-0-798"></a>                                    <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-799"><a id="__codelineno-0-799" name="__codelineno-0-799"></a>                                        <span class="c1"># break row loop</span>
+</span><span id="__span-0-800"><a id="__codelineno-0-800" name="__codelineno-0-800"></a>                                        <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;break row loop&quot;</span><span class="p">)</span>
+</span><span id="__span-0-801"><a id="__codelineno-0-801" name="__codelineno-0-801"></a>                                        <span class="k">break</span>
+</span><span id="__span-0-802"><a id="__codelineno-0-802" name="__codelineno-0-802"></a>
+</span><span id="__span-0-803"><a id="__codelineno-0-803" name="__codelineno-0-803"></a>                                    <span class="n">text</span><span class="p">:</span> <span class="n">pa</span><span class="o">.</span><span class="n">StringScalar</span> <span class="o">=</span> <span class="n">text_column</span>
+</span><span id="__span-0-804"><a id="__codelineno-0-804" name="__codelineno-0-804"></a>
+</span><span id="__span-0-805"><a id="__codelineno-0-805" name="__codelineno-0-805"></a>                                    <span class="k">if</span> <span class="n">cast_to_py_string</span><span class="p">:</span>
+</span><span id="__span-0-806"><a id="__codelineno-0-806" name="__codelineno-0-806"></a>                                        <span class="c1"># cast to string</span>
+</span><span id="__span-0-807"><a id="__codelineno-0-807" name="__codelineno-0-807"></a>                                        <span class="n">text</span> <span class="o">=</span> <span class="n">text_column</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span>
+</span><span id="__span-0-808"><a id="__codelineno-0-808" name="__codelineno-0-808"></a>
+</span><span id="__span-0-809"><a id="__codelineno-0-809" name="__codelineno-0-809"></a>                                    <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-810"><a id="__codelineno-0-810" name="__codelineno-0-810"></a>                                    <span class="n">rows</span> <span class="o">+=</span> <span class="mi">1</span>
+</span><span id="__span-0-811"><a id="__codelineno-0-811" name="__codelineno-0-811"></a>
+</span><span id="__span-0-812"><a id="__codelineno-0-812" name="__codelineno-0-812"></a>                            <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-813"><a id="__codelineno-0-813" name="__codelineno-0-813"></a>                                <span class="c1"># break batch loop</span>
+</span><span id="__span-0-814"><a id="__codelineno-0-814" name="__codelineno-0-814"></a>                                <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;break batch loop&quot;</span><span class="p">)</span>
+</span><span id="__span-0-815"><a id="__codelineno-0-815" name="__codelineno-0-815"></a>                                <span class="k">break</span>
+</span><span id="__span-0-816"><a id="__codelineno-0-816" name="__codelineno-0-816"></a>
+</span><span id="__span-0-817"><a id="__codelineno-0-817" name="__codelineno-0-817"></a>                        <span class="c1"># PyArrow implementation with read_row_group</span>
+</span><span id="__span-0-818"><a id="__codelineno-0-818" name="__codelineno-0-818"></a>                        <span class="c1"># with open(file_path, &quot;rb&quot;) as file_handler:</span>
+</span><span id="__span-0-819"><a id="__codelineno-0-819" name="__codelineno-0-819"></a>                        <span class="c1">#     parquet_file = pq.ParquetFile(file_handler)</span>
+</span><span id="__span-0-820"><a id="__codelineno-0-820" name="__codelineno-0-820"></a>
+</span><span id="__span-0-821"><a id="__codelineno-0-821" name="__codelineno-0-821"></a>                        <span class="c1">#     # 1. What row groups need to be read?</span>
+</span><span id="__span-0-822"><a id="__codelineno-0-822" name="__codelineno-0-822"></a>                        <span class="c1">#     row_groups, group_idx_to_offset_last_row = get_selected_row_groups(</span>
+</span><span id="__span-0-823"><a id="__codelineno-0-823" name="__codelineno-0-823"></a>                        <span class="c1">#         parquet_file, file_offset, file_limit</span>
+</span><span id="__span-0-824"><a id="__codelineno-0-824" name="__codelineno-0-824"></a>                        <span class="c1">#     )</span>
+</span><span id="__span-0-825"><a id="__codelineno-0-825" name="__codelineno-0-825"></a>                        <span class="c1">#     logger.debug(&quot;Selected row groups: %s; %s&quot;, row_groups, group_idx_to_offset_last_row)</span>
+</span><span id="__span-0-826"><a id="__codelineno-0-826" name="__codelineno-0-826"></a>
+</span><span id="__span-0-827"><a id="__codelineno-0-827" name="__codelineno-0-827"></a>                        <span class="c1">#     # 2. Read selected row groups</span>
+</span><span id="__span-0-828"><a id="__codelineno-0-828" name="__codelineno-0-828"></a>                        <span class="c1">#     for selected_row_group in row_groups:</span>
+</span><span id="__span-0-829"><a id="__codelineno-0-829" name="__codelineno-0-829"></a>                        <span class="c1">#         logger.debug(&quot;Read row group: %s&quot;, selected_row_group)</span>
+</span><span id="__span-0-830"><a id="__codelineno-0-830" name="__codelineno-0-830"></a>                        <span class="c1">#         group_table = parquet_file.read_row_group(</span>
+</span><span id="__span-0-831"><a id="__codelineno-0-831" name="__codelineno-0-831"></a>                        <span class="c1">#             selected_row_group, columns=[self.get_output_text_field()]</span>
+</span><span id="__span-0-832"><a id="__codelineno-0-832" name="__codelineno-0-832"></a>                        <span class="c1">#         )</span>
+</span><span id="__span-0-833"><a id="__codelineno-0-833" name="__codelineno-0-833"></a>
+</span><span id="__span-0-834"><a id="__codelineno-0-834" name="__codelineno-0-834"></a>                        <span class="c1">#         # What offsets and limit? (only if needed)</span>
+</span><span id="__span-0-835"><a id="__codelineno-0-835" name="__codelineno-0-835"></a>                        <span class="c1">#         if group_idx_to_offset_last_row is not None:</span>
+</span><span id="__span-0-836"><a id="__codelineno-0-836" name="__codelineno-0-836"></a>                        <span class="c1">#             group_offset, _ = group_idx_to_offset_last_row[selected_row_group]</span>
+</span><span id="__span-0-837"><a id="__codelineno-0-837" name="__codelineno-0-837"></a>
+</span><span id="__span-0-838"><a id="__codelineno-0-838" name="__codelineno-0-838"></a>                        <span class="c1">#             row_offset = max(0, file_offset - group_offset)</span>
+</span><span id="__span-0-839"><a id="__codelineno-0-839" name="__codelineno-0-839"></a>                        <span class="c1">#             logger.debug(&quot;Row group: %s; row offset: %s&quot;, selected_row_group, row_offset)</span>
+</span><span id="__span-0-840"><a id="__codelineno-0-840" name="__codelineno-0-840"></a>
+</span><span id="__span-0-841"><a id="__codelineno-0-841" name="__codelineno-0-841"></a>                        <span class="c1">#         # Iterate over rows</span>
+</span><span id="__span-0-842"><a id="__codelineno-0-842" name="__codelineno-0-842"></a>                        <span class="c1">#         for row_idx, text_column in enumerate(group_table.columns[0]):</span>
+</span><span id="__span-0-843"><a id="__codelineno-0-843" name="__codelineno-0-843"></a>                        <span class="c1">#             # Skip rows before offset</span>
+</span><span id="__span-0-844"><a id="__codelineno-0-844" name="__codelineno-0-844"></a>                        <span class="c1">#             if group_idx_to_offset_last_row is None or row_idx &gt;= row_offset:</span>
+</span><span id="__span-0-845"><a id="__codelineno-0-845" name="__codelineno-0-845"></a>                        <span class="c1">#                 if rows_limit &gt; 0 and rows &gt;= rows_limit:</span>
+</span><span id="__span-0-846"><a id="__codelineno-0-846" name="__codelineno-0-846"></a>                        <span class="c1">#                     # break row loop</span>
+</span><span id="__span-0-847"><a id="__codelineno-0-847" name="__codelineno-0-847"></a>                        <span class="c1">#                     logger.debug(&quot;break row loop&quot;)</span>
+</span><span id="__span-0-848"><a id="__codelineno-0-848" name="__codelineno-0-848"></a>                        <span class="c1">#                     break</span>
+</span><span id="__span-0-849"><a id="__codelineno-0-849" name="__codelineno-0-849"></a>
+</span><span id="__span-0-850"><a id="__codelineno-0-850" name="__codelineno-0-850"></a>                        <span class="c1">#                 text = text_column.as_py()  # cast to str</span>
+</span><span id="__span-0-851"><a id="__codelineno-0-851" name="__codelineno-0-851"></a>                        <span class="c1">#                 yield text</span>
+</span><span id="__span-0-852"><a id="__codelineno-0-852" name="__codelineno-0-852"></a>                        <span class="c1">#                 rows += 1</span>
+</span><span id="__span-0-853"><a id="__codelineno-0-853" name="__codelineno-0-853"></a>
+</span><span id="__span-0-854"><a id="__codelineno-0-854" name="__codelineno-0-854"></a>                        <span class="c1">#         if rows_limit &gt; 0 and rows &gt;= rows_limit:</span>
+</span><span id="__span-0-855"><a id="__codelineno-0-855" name="__codelineno-0-855"></a>                        <span class="c1">#             # break row group loop</span>
+</span><span id="__span-0-856"><a id="__codelineno-0-856" name="__codelineno-0-856"></a>                        <span class="c1">#             logger.debug(&quot;break row group loop&quot;)</span>
+</span><span id="__span-0-857"><a id="__codelineno-0-857" name="__codelineno-0-857"></a>                        <span class="c1">#             break</span>
+</span><span id="__span-0-858"><a id="__codelineno-0-858" name="__codelineno-0-858"></a>
+</span><span id="__span-0-859"><a id="__codelineno-0-859" name="__codelineno-0-859"></a>                    <span class="k">elif</span> <span class="n">reader_implementation</span> <span class="o">==</span> <span class="s2">&quot;polars_read_parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-860"><a id="__codelineno-0-860" name="__codelineno-0-860"></a>                        <span class="c1"># Polars &quot;scan_parquet&quot; implementation: Error &quot;Segmentation fault (core dumped)&quot;</span>
+</span><span id="__span-0-861"><a id="__codelineno-0-861" name="__codelineno-0-861"></a>                        <span class="c1"># df = (</span>
+</span><span id="__span-0-862"><a id="__codelineno-0-862" name="__codelineno-0-862"></a>                        <span class="c1">#     pl.scan_parquet(file_path, low_memory=True).collect(</span>
+</span><span id="__span-0-863"><a id="__codelineno-0-863" name="__codelineno-0-863"></a>                        <span class="c1">#     streaming=True</span>
+</span><span id="__span-0-864"><a id="__codelineno-0-864" name="__codelineno-0-864"></a>                        <span class="c1"># ).slice(offset=file_offset, length=file_limit if file_limit != 0 else None)</span>
+</span><span id="__span-0-865"><a id="__codelineno-0-865" name="__codelineno-0-865"></a>                        <span class="c1">#     .collect(streaming=True)</span>
+</span><span id="__span-0-866"><a id="__codelineno-0-866" name="__codelineno-0-866"></a>                        <span class="c1"># )</span>
+</span><span id="__span-0-867"><a id="__codelineno-0-867" name="__codelineno-0-867"></a>                        <span class="c1"># text_column_index = df.columns.index(self.get_output_text_field())</span>
+</span><span id="__span-0-868"><a id="__codelineno-0-868" name="__codelineno-0-868"></a>
+</span><span id="__span-0-869"><a id="__codelineno-0-869" name="__codelineno-0-869"></a>                        <span class="n">df</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span>
+</span><span id="__span-0-870"><a id="__codelineno-0-870" name="__codelineno-0-870"></a>                            <span class="n">file_path</span><span class="p">,</span> <span class="n">low_memory</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">()]</span>
+</span><span id="__span-0-871"><a id="__codelineno-0-871" name="__codelineno-0-871"></a>                        <span class="p">)</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="n">offset</span><span class="o">=</span><span class="n">file_offset</span><span class="p">,</span> <span class="n">length</span><span class="o">=</span><span class="n">file_limit</span> <span class="k">if</span> <span class="n">file_limit</span> <span class="o">!=</span> <span class="mi">0</span> <span class="k">else</span> <span class="kc">None</span><span class="p">)</span>
+</span><span id="__span-0-872"><a id="__codelineno-0-872" name="__codelineno-0-872"></a>                        <span class="n">text_column_index</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-873"><a id="__codelineno-0-873" name="__codelineno-0-873"></a>
+</span><span id="__span-0-874"><a id="__codelineno-0-874" name="__codelineno-0-874"></a>                        <span class="c1"># Iterate over rows</span>
+</span><span id="__span-0-875"><a id="__codelineno-0-875" name="__codelineno-0-875"></a>                        <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">iter_rows</span><span class="p">():</span>
+</span><span id="__span-0-876"><a id="__codelineno-0-876" name="__codelineno-0-876"></a>                            <span class="n">text</span> <span class="o">=</span> <span class="n">row</span><span class="p">[</span><span class="n">text_column_index</span><span class="p">]</span>
+</span><span id="__span-0-877"><a id="__codelineno-0-877" name="__codelineno-0-877"></a>
+</span><span id="__span-0-878"><a id="__codelineno-0-878" name="__codelineno-0-878"></a>                            <span class="k">if</span> <span class="n">cast_to_py_string</span><span class="p">:</span>
+</span><span id="__span-0-879"><a id="__codelineno-0-879" name="__codelineno-0-879"></a>                                <span class="n">text</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
+</span><span id="__span-0-880"><a id="__codelineno-0-880" name="__codelineno-0-880"></a>
+</span><span id="__span-0-881"><a id="__codelineno-0-881" name="__codelineno-0-881"></a>                            <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-882"><a id="__codelineno-0-882" name="__codelineno-0-882"></a>                            <span class="n">rows</span> <span class="o">+=</span> <span class="mi">1</span>
+</span><span id="__span-0-883"><a id="__codelineno-0-883" name="__codelineno-0-883"></a>
+</span><span id="__span-0-884"><a id="__codelineno-0-884" name="__codelineno-0-884"></a>                            <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-885"><a id="__codelineno-0-885" name="__codelineno-0-885"></a>                                <span class="c1"># break row loop</span>
+</span><span id="__span-0-886"><a id="__codelineno-0-886" name="__codelineno-0-886"></a>                                <span class="k">break</span>
+</span><span id="__span-0-887"><a id="__codelineno-0-887" name="__codelineno-0-887"></a>                        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-888"><a id="__codelineno-0-888" name="__codelineno-0-888"></a>                            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Invalid `reader_implementation`&quot;</span><span class="p">)</span>
+</span><span id="__span-0-889"><a id="__codelineno-0-889" name="__codelineno-0-889"></a>                <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-890"><a id="__codelineno-0-890" name="__codelineno-0-890"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;Skip this file because output does not contain the requested rows: </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-891"><a id="__codelineno-0-891" name="__codelineno-0-891"></a>
+</span><span id="__span-0-892"><a id="__codelineno-0-892" name="__codelineno-0-892"></a>                <span class="c1"># current_offset += file_rows_count  # TODO +1?</span>
+</span><span id="__span-0-893"><a id="__codelineno-0-893" name="__codelineno-0-893"></a>                <span class="n">chunk_start</span> <span class="o">=</span> <span class="n">chunk_end</span> <span class="o">+</span> <span class="mi">1</span>  <span class="c1"># set start for the next chunk</span>
+</span><span id="__span-0-894"><a id="__codelineno-0-894" name="__codelineno-0-894"></a>
+</span><span id="__span-0-895"><a id="__codelineno-0-895" name="__codelineno-0-895"></a>            <span class="k">if</span> <span class="n">rows_limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">rows</span> <span class="o">&gt;=</span> <span class="n">rows_limit</span><span class="p">:</span>
+</span><span id="__span-0-896"><a id="__codelineno-0-896" name="__codelineno-0-896"></a>                <span class="c1"># break file loop</span>
+</span><span id="__span-0-897"><a id="__codelineno-0-897" name="__codelineno-0-897"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;break file loop&quot;</span><span class="p">)</span>
+</span><span id="__span-0-898"><a id="__codelineno-0-898" name="__codelineno-0-898"></a>                <span class="k">break</span>
+</span><span id="__span-0-899"><a id="__codelineno-0-899" name="__codelineno-0-899"></a>    <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-900"><a id="__codelineno-0-900" name="__codelineno-0-900"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;Cannot generate texts because output files do not exist.&quot;</span><span class="p">)</span>
+</span><span id="__span-0-901"><a id="__codelineno-0-901" name="__codelineno-0-901"></a>
+</span><span id="__span-0-902"><a id="__codelineno-0-902" name="__codelineno-0-902"></a>    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
+</span><span id="__span-0-903"><a id="__codelineno-0-903" name="__codelineno-0-903"></a>        <span class="s2">&quot;Texts generated: </span><span class="si">%s</span><span class="s2"> (expected size: </span><span class="si">%s</span><span class="s2">; offset: </span><span class="si">%s</span><span class="s2">; limit: </span><span class="si">%s</span><span class="s2">;)&quot;</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="n">limit</span> <span class="o">-</span> <span class="n">offset</span><span class="p">,</span> <span class="n">offset</span><span class="p">,</span> <span class="n">limit</span>
+</span><span id="__span-0-904"><a id="__codelineno-0-904" name="__codelineno-0-904"></a>    <span class="p">)</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.get_compression_from_output_files" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_compression_from_output_files</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>NOTE: Currently only implemented for <code>parquet</code> format.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-679">679</a></span>
+<span class="normal"><a href="#__codelineno-0-680">680</a></span>
+<span class="normal"><a href="#__codelineno-0-681">681</a></span>
+<span class="normal"><a href="#__codelineno-0-682">682</a></span>
+<span class="normal"><a href="#__codelineno-0-683">683</a></span>
+<span class="normal"><a href="#__codelineno-0-684">684</a></span>
+<span class="normal"><a href="#__codelineno-0-685">685</a></span>
+<span class="normal"><a href="#__codelineno-0-686">686</a></span>
+<span class="normal"><a href="#__codelineno-0-687">687</a></span>
+<span class="normal"><a href="#__codelineno-0-688">688</a></span>
+<span class="normal"><a href="#__codelineno-0-689">689</a></span>
+<span class="normal"><a href="#__codelineno-0-690">690</a></span>
+<span class="normal"><a href="#__codelineno-0-691">691</a></span>
+<span class="normal"><a href="#__codelineno-0-692">692</a></span>
+<span class="normal"><a href="#__codelineno-0-693">693</a></span>
+<span class="normal"><a href="#__codelineno-0-694">694</a></span>
+<span class="normal"><a href="#__codelineno-0-695">695</a></span>
+<span class="normal"><a href="#__codelineno-0-696">696</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-679"><a id="__codelineno-0-679" name="__codelineno-0-679"></a><span class="k">def</span> <span class="nf">get_compression_from_output_files</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-680"><a id="__codelineno-0-680" name="__codelineno-0-680"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;NOTE: Currently only implemented for `parquet` format.&quot;&quot;&quot;</span>
+</span><span id="__span-0-681"><a id="__codelineno-0-681" name="__codelineno-0-681"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-682"><a id="__codelineno-0-682" name="__codelineno-0-682"></a>        <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">):</span>
+</span><span id="__span-0-683"><a id="__codelineno-0-683" name="__codelineno-0-683"></a>            <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_path</span><span class="p">):</span>
+</span><span id="__span-0-684"><a id="__codelineno-0-684" name="__codelineno-0-684"></a>                <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">output_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-685"><a id="__codelineno-0-685" name="__codelineno-0-685"></a>                    <span class="n">parquet_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-686"><a id="__codelineno-0-686" name="__codelineno-0-686"></a>                        <span class="n">f</span><span class="p">,</span>
+</span><span id="__span-0-687"><a id="__codelineno-0-687" name="__codelineno-0-687"></a>                        <span class="c1"># increased to avoid OSErrors</span>
+</span><span id="__span-0-688"><a id="__codelineno-0-688" name="__codelineno-0-688"></a>                        <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="mi">1000000000</span><span class="p">,</span>  <span class="c1"># default: 100000000</span>
+</span><span id="__span-0-689"><a id="__codelineno-0-689" name="__codelineno-0-689"></a>                        <span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="mi">10000000</span><span class="p">,</span>  <span class="c1"># default: 1000000</span>
+</span><span id="__span-0-690"><a id="__codelineno-0-690" name="__codelineno-0-690"></a>                    <span class="p">)</span>
+</span><span id="__span-0-691"><a id="__codelineno-0-691" name="__codelineno-0-691"></a>                    <span class="n">parquet_metadata</span> <span class="o">=</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span>
+</span><span id="__span-0-692"><a id="__codelineno-0-692" name="__codelineno-0-692"></a>                    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">parquet_metadata</span><span class="o">.</span><span class="n">num_row_groups</span><span class="p">):</span>
+</span><span id="__span-0-693"><a id="__codelineno-0-693" name="__codelineno-0-693"></a>                        <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">parquet_metadata</span><span class="o">.</span><span class="n">num_columns</span><span class="p">):</span>
+</span><span id="__span-0-694"><a id="__codelineno-0-694" name="__codelineno-0-694"></a>                            <span class="k">return</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">row_group</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">.</span><span class="n">column</span><span class="p">(</span><span class="n">j</span><span class="p">)</span><span class="o">.</span><span class="n">compression</span>
+</span><span id="__span-0-695"><a id="__codelineno-0-695" name="__codelineno-0-695"></a>
+</span><span id="__span-0-696"><a id="__codelineno-0-696" name="__codelineno-0-696"></a>    <span class="k">return</span> <span class="s2">&quot;unknown&quot;</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.get_estimated_bytes_from_output" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_estimated_bytes_from_output</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">read_first_n_rows</span><span class="o">=</span><span class="mi">1000</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Estimate byte size of output text:</p>
+      <ul><li>read first N rows of shuffled output files and count their byte size</li>
+      <li>multiply counted bytes by total number of rows</li></ul>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-906">906</a></span>
+<span class="normal"><a href="#__codelineno-0-907">907</a></span>
+<span class="normal"><a href="#__codelineno-0-908">908</a></span>
+<span class="normal"><a href="#__codelineno-0-909">909</a></span>
+<span class="normal"><a href="#__codelineno-0-910">910</a></span>
+<span class="normal"><a href="#__codelineno-0-911">911</a></span>
+<span class="normal"><a href="#__codelineno-0-912">912</a></span>
+<span class="normal"><a href="#__codelineno-0-913">913</a></span>
+<span class="normal"><a href="#__codelineno-0-914">914</a></span>
+<span class="normal"><a href="#__codelineno-0-915">915</a></span>
+<span class="normal"><a href="#__codelineno-0-916">916</a></span>
+<span class="normal"><a href="#__codelineno-0-917">917</a></span>
+<span class="normal"><a href="#__codelineno-0-918">918</a></span>
+<span class="normal"><a href="#__codelineno-0-919">919</a></span>
+<span class="normal"><a href="#__codelineno-0-920">920</a></span>
+<span class="normal"><a href="#__codelineno-0-921">921</a></span>
+<span class="normal"><a href="#__codelineno-0-922">922</a></span>
+<span class="normal"><a href="#__codelineno-0-923">923</a></span>
+<span class="normal"><a href="#__codelineno-0-924">924</a></span>
+<span class="normal"><a href="#__codelineno-0-925">925</a></span>
+<span class="normal"><a href="#__codelineno-0-926">926</a></span>
+<span class="normal"><a href="#__codelineno-0-927">927</a></span>
+<span class="normal"><a href="#__codelineno-0-928">928</a></span>
+<span class="normal"><a href="#__codelineno-0-929">929</a></span>
+<span class="normal"><a href="#__codelineno-0-930">930</a></span>
+<span class="normal"><a href="#__codelineno-0-931">931</a></span>
+<span class="normal"><a href="#__codelineno-0-932">932</a></span>
+<span class="normal"><a href="#__codelineno-0-933">933</a></span>
+<span class="normal"><a href="#__codelineno-0-934">934</a></span>
+<span class="normal"><a href="#__codelineno-0-935">935</a></span>
+<span class="normal"><a href="#__codelineno-0-936">936</a></span>
+<span class="normal"><a href="#__codelineno-0-937">937</a></span>
+<span class="normal"><a href="#__codelineno-0-938">938</a></span>
+<span class="normal"><a href="#__codelineno-0-939">939</a></span>
+<span class="normal"><a href="#__codelineno-0-940">940</a></span>
+<span class="normal"><a href="#__codelineno-0-941">941</a></span>
+<span class="normal"><a href="#__codelineno-0-942">942</a></span>
+<span class="normal"><a href="#__codelineno-0-943">943</a></span>
+<span class="normal"><a href="#__codelineno-0-944">944</a></span>
+<span class="normal"><a href="#__codelineno-0-945">945</a></span>
+<span class="normal"><a href="#__codelineno-0-946">946</a></span>
+<span class="normal"><a href="#__codelineno-0-947">947</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-906"><a id="__codelineno-0-906" name="__codelineno-0-906"></a><span class="k">def</span> <span class="nf">get_estimated_bytes_from_output</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">read_first_n_rows</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1_000</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
+</span><span id="__span-0-907"><a id="__codelineno-0-907" name="__codelineno-0-907"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Estimate byte size of output text:</span>
+</span><span id="__span-0-908"><a id="__codelineno-0-908" name="__codelineno-0-908"></a><span class="sd">    - read first N rows of shuffled output files and count their byte size</span>
+</span><span id="__span-0-909"><a id="__codelineno-0-909" name="__codelineno-0-909"></a><span class="sd">    - multiply counted bytes by total number of rows</span>
+</span><span id="__span-0-910"><a id="__codelineno-0-910" name="__codelineno-0-910"></a><span class="sd">    &quot;&quot;&quot;</span>
+</span><span id="__span-0-911"><a id="__codelineno-0-911" name="__codelineno-0-911"></a>    <span class="k">if</span> <span class="ow">not</span> <span class="n">shuffled</span><span class="p">:</span>
+</span><span id="__span-0-912"><a id="__codelineno-0-912" name="__codelineno-0-912"></a>        <span class="k">raise</span> <span class="ne">NotImplementedError</span>
+</span><span id="__span-0-913"><a id="__codelineno-0-913" name="__codelineno-0-913"></a>
+</span><span id="__span-0-914"><a id="__codelineno-0-914" name="__codelineno-0-914"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">!=</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-915"><a id="__codelineno-0-915" name="__codelineno-0-915"></a>        <span class="k">raise</span> <span class="ne">NotImplementedError</span>
+</span><span id="__span-0-916"><a id="__codelineno-0-916" name="__codelineno-0-916"></a>
+</span><span id="__span-0-917"><a id="__codelineno-0-917" name="__codelineno-0-917"></a>    <span class="n">bytes_sum</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-918"><a id="__codelineno-0-918" name="__codelineno-0-918"></a>    <span class="n">total_rows</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-919"><a id="__codelineno-0-919" name="__codelineno-0-919"></a>
+</span><span id="__span-0-920"><a id="__codelineno-0-920" name="__codelineno-0-920"></a>    <span class="c1"># iterate over output files (use shuffled files for a better estimate)</span>
+</span><span id="__span-0-921"><a id="__codelineno-0-921" name="__codelineno-0-921"></a>    <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">):</span>
+</span><span id="__span-0-922"><a id="__codelineno-0-922" name="__codelineno-0-922"></a>        <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_path</span><span class="p">):</span>
+</span><span id="__span-0-923"><a id="__codelineno-0-923" name="__codelineno-0-923"></a>            <span class="c1"># read the first n rows</span>
+</span><span id="__span-0-924"><a id="__codelineno-0-924" name="__codelineno-0-924"></a>            <span class="n">df</span> <span class="o">=</span> <span class="n">pl</span><span class="o">.</span><span class="n">scan_parquet</span><span class="p">(</span>
+</span><span id="__span-0-925"><a id="__codelineno-0-925" name="__codelineno-0-925"></a>                <span class="n">output_path</span><span class="p">,</span>
+</span><span id="__span-0-926"><a id="__codelineno-0-926" name="__codelineno-0-926"></a>                <span class="n">low_memory</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+</span><span id="__span-0-927"><a id="__codelineno-0-927" name="__codelineno-0-927"></a>                <span class="n">n_rows</span><span class="o">=</span><span class="n">read_first_n_rows</span><span class="p">,</span>
+</span><span id="__span-0-928"><a id="__codelineno-0-928" name="__codelineno-0-928"></a>            <span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">(</span><span class="n">streaming</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+</span><span id="__span-0-929"><a id="__codelineno-0-929" name="__codelineno-0-929"></a>            <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">iter_rows</span><span class="p">():</span>
+</span><span id="__span-0-930"><a id="__codelineno-0-930" name="__codelineno-0-930"></a>                <span class="n">text</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
+</span><span id="__span-0-931"><a id="__codelineno-0-931" name="__codelineno-0-931"></a>                <span class="n">bytes_sum</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">text</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">))</span>  <span class="c1"># count the byte size of the text</span>
+</span><span id="__span-0-932"><a id="__codelineno-0-932" name="__codelineno-0-932"></a>
+</span><span id="__span-0-933"><a id="__codelineno-0-933" name="__codelineno-0-933"></a>            <span class="c1"># read total row count from metadata</span>
+</span><span id="__span-0-934"><a id="__codelineno-0-934" name="__codelineno-0-934"></a>            <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">output_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-935"><a id="__codelineno-0-935" name="__codelineno-0-935"></a>                <span class="n">parquet_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-936"><a id="__codelineno-0-936" name="__codelineno-0-936"></a>                    <span class="n">f</span><span class="p">,</span>
+</span><span id="__span-0-937"><a id="__codelineno-0-937" name="__codelineno-0-937"></a>                    <span class="c1"># increased to avoid OSErrors</span>
+</span><span id="__span-0-938"><a id="__codelineno-0-938" name="__codelineno-0-938"></a>                    <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="mi">1000000000</span><span class="p">,</span>  <span class="c1"># default: 100000000</span>
+</span><span id="__span-0-939"><a id="__codelineno-0-939" name="__codelineno-0-939"></a>                    <span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="mi">10000000</span><span class="p">,</span>  <span class="c1"># default: 1000000</span>
+</span><span id="__span-0-940"><a id="__codelineno-0-940" name="__codelineno-0-940"></a>                <span class="p">)</span>
+</span><span id="__span-0-941"><a id="__codelineno-0-941" name="__codelineno-0-941"></a>                <span class="n">total_rows</span> <span class="o">+=</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_rows</span>
+</span><span id="__span-0-942"><a id="__codelineno-0-942" name="__codelineno-0-942"></a>
+</span><span id="__span-0-943"><a id="__codelineno-0-943" name="__codelineno-0-943"></a>    <span class="c1"># estimated bytes</span>
+</span><span id="__span-0-944"><a id="__codelineno-0-944" name="__codelineno-0-944"></a>    <span class="n">bytes_per_row</span> <span class="o">=</span> <span class="n">bytes_sum</span> <span class="o">/</span> <span class="n">read_first_n_rows</span>
+</span><span id="__span-0-945"><a id="__codelineno-0-945" name="__codelineno-0-945"></a>    <span class="n">total_bytes</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">total_rows</span> <span class="o">*</span> <span class="n">bytes_per_row</span><span class="p">)</span>
+</span><span id="__span-0-946"><a id="__codelineno-0-946" name="__codelineno-0-946"></a>
+</span><span id="__span-0-947"><a id="__codelineno-0-947" name="__codelineno-0-947"></a>    <span class="k">return</span> <span class="n">total_bytes</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.get_output_rows_count" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_output_rows_count</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Read metadata from parquet files and extract number of rows</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-649">649</a></span>
+<span class="normal"><a href="#__codelineno-0-650">650</a></span>
+<span class="normal"><a href="#__codelineno-0-651">651</a></span>
+<span class="normal"><a href="#__codelineno-0-652">652</a></span>
+<span class="normal"><a href="#__codelineno-0-653">653</a></span>
+<span class="normal"><a href="#__codelineno-0-654">654</a></span>
+<span class="normal"><a href="#__codelineno-0-655">655</a></span>
+<span class="normal"><a href="#__codelineno-0-656">656</a></span>
+<span class="normal"><a href="#__codelineno-0-657">657</a></span>
+<span class="normal"><a href="#__codelineno-0-658">658</a></span>
+<span class="normal"><a href="#__codelineno-0-659">659</a></span>
+<span class="normal"><a href="#__codelineno-0-660">660</a></span>
+<span class="normal"><a href="#__codelineno-0-661">661</a></span>
+<span class="normal"><a href="#__codelineno-0-662">662</a></span>
+<span class="normal"><a href="#__codelineno-0-663">663</a></span>
+<span class="normal"><a href="#__codelineno-0-664">664</a></span>
+<span class="normal"><a href="#__codelineno-0-665">665</a></span>
+<span class="normal"><a href="#__codelineno-0-666">666</a></span>
+<span class="normal"><a href="#__codelineno-0-667">667</a></span>
+<span class="normal"><a href="#__codelineno-0-668">668</a></span>
+<span class="normal"><a href="#__codelineno-0-669">669</a></span>
+<span class="normal"><a href="#__codelineno-0-670">670</a></span>
+<span class="normal"><a href="#__codelineno-0-671">671</a></span>
+<span class="normal"><a href="#__codelineno-0-672">672</a></span>
+<span class="normal"><a href="#__codelineno-0-673">673</a></span>
+<span class="normal"><a href="#__codelineno-0-674">674</a></span>
+<span class="normal"><a href="#__codelineno-0-675">675</a></span>
+<span class="normal"><a href="#__codelineno-0-676">676</a></span>
+<span class="normal"><a href="#__codelineno-0-677">677</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-649"><a id="__codelineno-0-649" name="__codelineno-0-649"></a><span class="k">def</span> <span class="nf">get_output_rows_count</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shuffled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
+</span><span id="__span-0-650"><a id="__codelineno-0-650" name="__codelineno-0-650"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Read metadata from parquet files and extract number of rows&quot;&quot;&quot;</span>
+</span><span id="__span-0-651"><a id="__codelineno-0-651" name="__codelineno-0-651"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-652"><a id="__codelineno-0-652" name="__codelineno-0-652"></a>        <span class="n">output_paths</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">shuffled</span><span class="o">=</span><span class="n">shuffled</span><span class="p">))</span>
+</span><span id="__span-0-653"><a id="__codelineno-0-653" name="__codelineno-0-653"></a>
+</span><span id="__span-0-654"><a id="__codelineno-0-654" name="__codelineno-0-654"></a>        <span class="c1"># Filter for existing</span>
+</span><span id="__span-0-655"><a id="__codelineno-0-655" name="__codelineno-0-655"></a>        <span class="n">output_paths</span> <span class="o">=</span> <span class="p">[</span><span class="n">output_path</span> <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="n">output_paths</span> <span class="k">if</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">output_path</span><span class="p">)]</span>
+</span><span id="__span-0-656"><a id="__codelineno-0-656" name="__codelineno-0-656"></a>
+</span><span id="__span-0-657"><a id="__codelineno-0-657" name="__codelineno-0-657"></a>        <span class="k">if</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-658"><a id="__codelineno-0-658" name="__codelineno-0-658"></a>            <span class="n">rows_count</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-659"><a id="__codelineno-0-659" name="__codelineno-0-659"></a>
+</span><span id="__span-0-660"><a id="__codelineno-0-660" name="__codelineno-0-660"></a>            <span class="k">for</span> <span class="n">output_path</span> <span class="ow">in</span> <span class="n">output_paths</span><span class="p">:</span>
+</span><span id="__span-0-661"><a id="__codelineno-0-661" name="__codelineno-0-661"></a>                <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">output_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-662"><a id="__codelineno-0-662" name="__codelineno-0-662"></a>                    <span class="n">parquet_file</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span>
+</span><span id="__span-0-663"><a id="__codelineno-0-663" name="__codelineno-0-663"></a>                        <span class="n">f</span><span class="p">,</span>
+</span><span id="__span-0-664"><a id="__codelineno-0-664" name="__codelineno-0-664"></a>                        <span class="c1"># increased to avoid OSErrors</span>
+</span><span id="__span-0-665"><a id="__codelineno-0-665" name="__codelineno-0-665"></a>                        <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="mi">1000000000</span><span class="p">,</span>  <span class="c1"># default: 100000000</span>
+</span><span id="__span-0-666"><a id="__codelineno-0-666" name="__codelineno-0-666"></a>                        <span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="mi">10000000</span><span class="p">,</span>  <span class="c1"># default: 1000000</span>
+</span><span id="__span-0-667"><a id="__codelineno-0-667" name="__codelineno-0-667"></a>                    <span class="p">)</span>
+</span><span id="__span-0-668"><a id="__codelineno-0-668" name="__codelineno-0-668"></a>                    <span class="n">rows_count</span> <span class="o">+=</span> <span class="n">parquet_file</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_rows</span>
+</span><span id="__span-0-669"><a id="__codelineno-0-669" name="__codelineno-0-669"></a>
+</span><span id="__span-0-670"><a id="__codelineno-0-670" name="__codelineno-0-670"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;Rows = </span><span class="si">%s</span><span class="s2"> in </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">rows_count</span><span class="p">,</span> <span class="n">output_path</span><span class="p">)</span>
+</span><span id="__span-0-671"><a id="__codelineno-0-671" name="__codelineno-0-671"></a>
+</span><span id="__span-0-672"><a id="__codelineno-0-672" name="__codelineno-0-672"></a>            <span class="k">return</span> <span class="n">rows_count</span>
+</span><span id="__span-0-673"><a id="__codelineno-0-673" name="__codelineno-0-673"></a>
+</span><span id="__span-0-674"><a id="__codelineno-0-674" name="__codelineno-0-674"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s2">&quot;No output files exists for </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">)</span>
+</span><span id="__span-0-675"><a id="__codelineno-0-675" name="__codelineno-0-675"></a>        <span class="k">return</span> <span class="o">-</span><span class="mi">1</span>
+</span><span id="__span-0-676"><a id="__codelineno-0-676" name="__codelineno-0-676"></a>    <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-677"><a id="__codelineno-0-677" name="__codelineno-0-677"></a>        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cannot determine the output rows count with </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_format</span><span class="si">=}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.get_sampling_factor" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_sampling_factor</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Sampling is defined based on dataset ID, source ID, or language.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-949">949</a></span>
+<span class="normal"><a href="#__codelineno-0-950">950</a></span>
+<span class="normal"><a href="#__codelineno-0-951">951</a></span>
+<span class="normal"><a href="#__codelineno-0-952">952</a></span>
+<span class="normal"><a href="#__codelineno-0-953">953</a></span>
+<span class="normal"><a href="#__codelineno-0-954">954</a></span>
+<span class="normal"><a href="#__codelineno-0-955">955</a></span>
+<span class="normal"><a href="#__codelineno-0-956">956</a></span>
+<span class="normal"><a href="#__codelineno-0-957">957</a></span>
+<span class="normal"><a href="#__codelineno-0-958">958</a></span>
+<span class="normal"><a href="#__codelineno-0-959">959</a></span>
+<span class="normal"><a href="#__codelineno-0-960">960</a></span>
+<span class="normal"><a href="#__codelineno-0-961">961</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-949"><a id="__codelineno-0-949" name="__codelineno-0-949"></a><span class="k">def</span> <span class="nf">get_sampling_factor</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
+</span><span id="__span-0-950"><a id="__codelineno-0-950" name="__codelineno-0-950"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Sampling is defined based on dataset ID, source ID, or language.&quot;&quot;&quot;</span>
+</span><span id="__span-0-951"><a id="__codelineno-0-951" name="__codelineno-0-951"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">:</span>
+</span><span id="__span-0-952"><a id="__codelineno-0-952" name="__codelineno-0-952"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_dataset_id</span><span class="p">:</span>
+</span><span id="__span-0-953"><a id="__codelineno-0-953" name="__codelineno-0-953"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_dataset_id</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">]</span>
+</span><span id="__span-0-954"><a id="__codelineno-0-954" name="__codelineno-0-954"></a>
+</span><span id="__span-0-955"><a id="__codelineno-0-955" name="__codelineno-0-955"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_source_id</span><span class="p">:</span>
+</span><span id="__span-0-956"><a id="__codelineno-0-956" name="__codelineno-0-956"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_source_id</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()]</span>
+</span><span id="__span-0-957"><a id="__codelineno-0-957" name="__codelineno-0-957"></a>
+</span><span id="__span-0-958"><a id="__codelineno-0-958" name="__codelineno-0-958"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_language_code</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_language</span><span class="p">:</span>
+</span><span id="__span-0-959"><a id="__codelineno-0-959" name="__codelineno-0-959"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">sampling_factor_by_language</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">get_language_code</span><span class="p">()]</span>
+</span><span id="__span-0-960"><a id="__codelineno-0-960" name="__codelineno-0-960"></a>
+</span><span id="__span-0-961"><a id="__codelineno-0-961" name="__codelineno-0-961"></a>    <span class="k">return</span> <span class="mf">1.0</span>  <span class="c1"># default factor</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.is_selected" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">is_selected</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Is this dataset part of selected datasets or sources?</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-963">963</a></span>
+<span class="normal"><a href="#__codelineno-0-964">964</a></span>
+<span class="normal"><a href="#__codelineno-0-965">965</a></span>
+<span class="normal"><a href="#__codelineno-0-966">966</a></span>
+<span class="normal"><a href="#__codelineno-0-967">967</a></span>
+<span class="normal"><a href="#__codelineno-0-968">968</a></span>
+<span class="normal"><a href="#__codelineno-0-969">969</a></span>
+<span class="normal"><a href="#__codelineno-0-970">970</a></span>
+<span class="normal"><a href="#__codelineno-0-971">971</a></span>
+<span class="normal"><a href="#__codelineno-0-972">972</a></span>
+<span class="normal"><a href="#__codelineno-0-973">973</a></span>
+<span class="normal"><a href="#__codelineno-0-974">974</a></span>
+<span class="normal"><a href="#__codelineno-0-975">975</a></span>
+<span class="normal"><a href="#__codelineno-0-976">976</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-963"><a id="__codelineno-0-963" name="__codelineno-0-963"></a><span class="k">def</span> <span class="nf">is_selected</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
+</span><span id="__span-0-964"><a id="__codelineno-0-964" name="__codelineno-0-964"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Is this dataset part of selected datasets or sources?&quot;&quot;&quot;</span>
+</span><span id="__span-0-965"><a id="__codelineno-0-965" name="__codelineno-0-965"></a>    <span class="k">if</span> <span class="p">(</span>
+</span><span id="__span-0-966"><a id="__codelineno-0-966" name="__codelineno-0-966"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">selected_dataset_ids</span>
+</span><span id="__span-0-967"><a id="__codelineno-0-967" name="__codelineno-0-967"></a>        <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_source_id</span><span class="p">()</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">selected_source_ids</span>
+</span><span id="__span-0-968"><a id="__codelineno-0-968" name="__codelineno-0-968"></a>    <span class="p">):</span>
+</span><span id="__span-0-969"><a id="__codelineno-0-969" name="__codelineno-0-969"></a>        <span class="k">return</span> <span class="kc">True</span>
+</span><span id="__span-0-970"><a id="__codelineno-0-970" name="__codelineno-0-970"></a>    <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-971"><a id="__codelineno-0-971" name="__codelineno-0-971"></a>        <span class="c1"># try fnmatch</span>
+</span><span id="__span-0-972"><a id="__codelineno-0-972" name="__codelineno-0-972"></a>        <span class="k">for</span> <span class="n">pattern</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">get_selected_dataset_ids</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;fnmatch&quot;</span><span class="p">):</span>
+</span><span id="__span-0-973"><a id="__codelineno-0-973" name="__codelineno-0-973"></a>            <span class="k">if</span> <span class="n">fnmatch</span><span class="o">.</span><span class="n">fnmatch</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">DATASET_ID</span><span class="p">,</span> <span class="n">pattern</span><span class="p">):</span>
+</span><span id="__span-0-974"><a id="__codelineno-0-974" name="__codelineno-0-974"></a>                <span class="k">return</span> <span class="kc">True</span>
+</span><span id="__span-0-975"><a id="__codelineno-0-975" name="__codelineno-0-975"></a>
+</span><span id="__span-0-976"><a id="__codelineno-0-976" name="__codelineno-0-976"></a>        <span class="k">return</span> <span class="kc">False</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.save_stats" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">save_stats</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Save the processing statistics (counter) into a JSON file in the output directory.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-985"> 985</a></span>
+<span class="normal"><a href="#__codelineno-0-986"> 986</a></span>
+<span class="normal"><a href="#__codelineno-0-987"> 987</a></span>
+<span class="normal"><a href="#__codelineno-0-988"> 988</a></span>
+<span class="normal"><a href="#__codelineno-0-989"> 989</a></span>
+<span class="normal"><a href="#__codelineno-0-990"> 990</a></span>
+<span class="normal"><a href="#__codelineno-0-991"> 991</a></span>
+<span class="normal"><a href="#__codelineno-0-992"> 992</a></span>
+<span class="normal"><a href="#__codelineno-0-993"> 993</a></span>
+<span class="normal"><a href="#__codelineno-0-994"> 994</a></span>
+<span class="normal"><a href="#__codelineno-0-995"> 995</a></span>
+<span class="normal"><a href="#__codelineno-0-996"> 996</a></span>
+<span class="normal"><a href="#__codelineno-0-997"> 997</a></span>
+<span class="normal"><a href="#__codelineno-0-998"> 998</a></span>
+<span class="normal"><a href="#__codelineno-0-999"> 999</a></span>
+<span class="normal"><a href="#__codelineno-0-1000">1000</a></span>
+<span class="normal"><a href="#__codelineno-0-1001">1001</a></span>
+<span class="normal"><a href="#__codelineno-0-1002">1002</a></span>
+<span class="normal"><a href="#__codelineno-0-1003">1003</a></span>
+<span class="normal"><a href="#__codelineno-0-1004">1004</a></span>
+<span class="normal"><a href="#__codelineno-0-1005">1005</a></span>
+<span class="normal"><a href="#__codelineno-0-1006">1006</a></span>
+<span class="normal"><a href="#__codelineno-0-1007">1007</a></span>
+<span class="normal"><a href="#__codelineno-0-1008">1008</a></span>
+<span class="normal"><a href="#__codelineno-0-1009">1009</a></span>
+<span class="normal"><a href="#__codelineno-0-1010">1010</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-985"><a id="__codelineno-0-985" name="__codelineno-0-985"></a><span class="k">def</span> <span class="nf">save_stats</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-986"><a id="__codelineno-0-986" name="__codelineno-0-986"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Save the processing statistics (counter) into a JSON file in the output directory.&quot;&quot;&quot;</span>
+</span><span id="__span-0-987"><a id="__codelineno-0-987" name="__codelineno-0-987"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">counter</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-988"><a id="__codelineno-0-988" name="__codelineno-0-988"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="s2">&quot;Cannot save statistics because none were recorded.&quot;</span><span class="p">)</span>
+</span><span id="__span-0-989"><a id="__codelineno-0-989" name="__codelineno-0-989"></a>        <span class="k">return</span>
+</span><span id="__span-0-990"><a id="__codelineno-0-990" name="__codelineno-0-990"></a>
+</span><span id="__span-0-991"><a id="__codelineno-0-991" name="__codelineno-0-991"></a>    <span class="n">date_format</span> <span class="o">=</span> <span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2">_%H%M%S&quot;</span>
+</span><span id="__span-0-992"><a id="__codelineno-0-992" name="__codelineno-0-992"></a>    <span class="bp">self</span><span class="o">.</span><span class="n">end_time</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
+</span><span id="__span-0-993"><a id="__codelineno-0-993" name="__codelineno-0-993"></a>    <span class="n">short_uuid</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">uuid</span><span class="o">.</span><span class="n">uuid4</span><span class="p">())[:</span><span class="mi">5</span><span class="p">]</span>
+</span><span id="__span-0-994"><a id="__codelineno-0-994" name="__codelineno-0-994"></a>    <span class="n">stats_file_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;stats_</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">end_time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">date_format</span><span class="p">)</span><span class="si">}</span><span class="s2">_</span><span class="si">{</span><span class="n">short_uuid</span><span class="si">}</span><span class="s2">.</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">get_job_id</span><span class="p">()</span><span class="si">}</span><span class="s2">.json&quot;</span>
+</span><span id="__span-0-995"><a id="__codelineno-0-995" name="__codelineno-0-995"></a>    <span class="n">stats_file_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_dir</span><span class="p">(),</span> <span class="n">stats_file_name</span><span class="p">)</span>
+</span><span id="__span-0-996"><a id="__codelineno-0-996" name="__codelineno-0-996"></a>
+</span><span id="__span-0-997"><a id="__codelineno-0-997" name="__codelineno-0-997"></a>    <span class="n">stats</span> <span class="o">=</span> <span class="p">{</span>
+</span><span id="__span-0-998"><a id="__codelineno-0-998" name="__codelineno-0-998"></a>        <span class="s2">&quot;counter&quot;</span><span class="p">:</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="p">),</span>
+</span><span id="__span-0-999"><a id="__codelineno-0-999" name="__codelineno-0-999"></a>        <span class="s2">&quot;start_time&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">start_time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">date_format</span><span class="p">),</span>
+</span><span id="__span-0-1000"><a id="__codelineno-0-1000" name="__codelineno-0-1000"></a>        <span class="s2">&quot;end_time&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">date_format</span><span class="p">),</span>
+</span><span id="__span-0-1001"><a id="__codelineno-0-1001" name="__codelineno-0-1001"></a>        <span class="s2">&quot;job_id&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">get_job_id</span><span class="p">(),</span>
+</span><span id="__span-0-1002"><a id="__codelineno-0-1002" name="__codelineno-0-1002"></a>        <span class="c1"># &quot;config&quot;: self.config,</span>
+</span><span id="__span-0-1003"><a id="__codelineno-0-1003" name="__codelineno-0-1003"></a>    <span class="p">}</span>
+</span><span id="__span-0-1004"><a id="__codelineno-0-1004" name="__codelineno-0-1004"></a>
+</span><span id="__span-0-1005"><a id="__codelineno-0-1005" name="__codelineno-0-1005"></a>    <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">stats_file_path</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-1006"><a id="__codelineno-0-1006" name="__codelineno-0-1006"></a>        <span class="n">json</span><span class="o">.</span><span class="n">dump</span><span class="p">(</span><span class="n">stats</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
+</span><span id="__span-0-1007"><a id="__codelineno-0-1007" name="__codelineno-0-1007"></a>
+</span><span id="__span-0-1008"><a id="__codelineno-0-1008" name="__codelineno-0-1008"></a>    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Statistics saved to </span><span class="si">{</span><span class="n">stats_file_path</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-1009"><a id="__codelineno-0-1009" name="__codelineno-0-1009"></a>
+</span><span id="__span-0-1010"><a id="__codelineno-0-1010" name="__codelineno-0-1010"></a>    <span class="k">return</span> <span class="n">stats_file_path</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.save_texts" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">save_texts</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="n">append</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Save texts in different formats</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-364">364</a></span>
+<span class="normal"><a href="#__codelineno-0-365">365</a></span>
+<span class="normal"><a href="#__codelineno-0-366">366</a></span>
+<span class="normal"><a href="#__codelineno-0-367">367</a></span>
+<span class="normal"><a href="#__codelineno-0-368">368</a></span>
+<span class="normal"><a href="#__codelineno-0-369">369</a></span>
+<span class="normal"><a href="#__codelineno-0-370">370</a></span>
+<span class="normal"><a href="#__codelineno-0-371">371</a></span>
+<span class="normal"><a href="#__codelineno-0-372">372</a></span>
+<span class="normal"><a href="#__codelineno-0-373">373</a></span>
+<span class="normal"><a href="#__codelineno-0-374">374</a></span>
+<span class="normal"><a href="#__codelineno-0-375">375</a></span>
+<span class="normal"><a href="#__codelineno-0-376">376</a></span>
+<span class="normal"><a href="#__codelineno-0-377">377</a></span>
+<span class="normal"><a href="#__codelineno-0-378">378</a></span>
+<span class="normal"><a href="#__codelineno-0-379">379</a></span>
+<span class="normal"><a href="#__codelineno-0-380">380</a></span>
+<span class="normal"><a href="#__codelineno-0-381">381</a></span>
+<span class="normal"><a href="#__codelineno-0-382">382</a></span>
+<span class="normal"><a href="#__codelineno-0-383">383</a></span>
+<span class="normal"><a href="#__codelineno-0-384">384</a></span>
+<span class="normal"><a href="#__codelineno-0-385">385</a></span>
+<span class="normal"><a href="#__codelineno-0-386">386</a></span>
+<span class="normal"><a href="#__codelineno-0-387">387</a></span>
+<span class="normal"><a href="#__codelineno-0-388">388</a></span>
+<span class="normal"><a href="#__codelineno-0-389">389</a></span>
+<span class="normal"><a href="#__codelineno-0-390">390</a></span>
+<span class="normal"><a href="#__codelineno-0-391">391</a></span>
+<span class="normal"><a href="#__codelineno-0-392">392</a></span>
+<span class="normal"><a href="#__codelineno-0-393">393</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-364"><a id="__codelineno-0-364" name="__codelineno-0-364"></a><span class="k">def</span> <span class="nf">save_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">append</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-365"><a id="__codelineno-0-365" name="__codelineno-0-365"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Save texts in different formats&quot;&quot;&quot;</span>
+</span><span id="__span-0-366"><a id="__codelineno-0-366" name="__codelineno-0-366"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_output_files</span><span class="p">()</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">override_output</span><span class="p">:</span>
+</span><span id="__span-0-367"><a id="__codelineno-0-367" name="__codelineno-0-367"></a>        <span class="k">raise</span> <span class="ne">FileExistsError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Output exists already (override not enabled): </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">()</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-368"><a id="__codelineno-0-368" name="__codelineno-0-368"></a>
+</span><span id="__span-0-369"><a id="__codelineno-0-369" name="__codelineno-0-369"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;jsonl&quot;</span><span class="p">:</span>
+</span><span id="__span-0-370"><a id="__codelineno-0-370" name="__codelineno-0-370"></a>        <span class="n">docs_count</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_texts_to_jsonl</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="n">append</span><span class="o">=</span><span class="n">append</span><span class="p">)</span>
+</span><span id="__span-0-371"><a id="__codelineno-0-371" name="__codelineno-0-371"></a>
+</span><span id="__span-0-372"><a id="__codelineno-0-372" name="__codelineno-0-372"></a>    <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
+</span><span id="__span-0-373"><a id="__codelineno-0-373" name="__codelineno-0-373"></a>        <span class="k">if</span> <span class="n">append</span><span class="p">:</span>
+</span><span id="__span-0-374"><a id="__codelineno-0-374" name="__codelineno-0-374"></a>            <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;Appending is not supported by parquet output format&quot;</span><span class="p">)</span>
+</span><span id="__span-0-375"><a id="__codelineno-0-375" name="__codelineno-0-375"></a>
+</span><span id="__span-0-376"><a id="__codelineno-0-376" name="__codelineno-0-376"></a>        <span class="n">docs_count</span><span class="p">,</span> <span class="n">saved_chunks</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">save_texts_to_parquet</span><span class="p">(</span><span class="n">texts</span><span class="p">)</span>
+</span><span id="__span-0-377"><a id="__codelineno-0-377" name="__codelineno-0-377"></a>
+</span><span id="__span-0-378"><a id="__codelineno-0-378" name="__codelineno-0-378"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;saved_chunks&quot;</span><span class="p">:</span> <span class="n">saved_chunks</span><span class="p">})</span>
+</span><span id="__span-0-379"><a id="__codelineno-0-379" name="__codelineno-0-379"></a>    <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-380"><a id="__codelineno-0-380" name="__codelineno-0-380"></a>        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Unsupported output format: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_format</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-381"><a id="__codelineno-0-381" name="__codelineno-0-381"></a>
+</span><span id="__span-0-382"><a id="__codelineno-0-382" name="__codelineno-0-382"></a>    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Documents saved: </span><span class="si">{</span><span class="n">docs_count</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-383"><a id="__codelineno-0-383" name="__codelineno-0-383"></a>
+</span><span id="__span-0-384"><a id="__codelineno-0-384" name="__codelineno-0-384"></a>    <span class="bp">self</span><span class="o">.</span><span class="n">counter</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="s2">&quot;docs_count&quot;</span><span class="p">:</span> <span class="n">docs_count</span><span class="p">})</span>
+</span><span id="__span-0-385"><a id="__codelineno-0-385" name="__codelineno-0-385"></a>
+</span><span id="__span-0-386"><a id="__codelineno-0-386" name="__codelineno-0-386"></a>    <span class="k">if</span> <span class="n">docs_count</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-387"><a id="__codelineno-0-387" name="__codelineno-0-387"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;No documents have been saved!&quot;</span><span class="p">)</span>
+</span><span id="__span-0-388"><a id="__codelineno-0-388" name="__codelineno-0-388"></a>
+</span><span id="__span-0-389"><a id="__codelineno-0-389" name="__codelineno-0-389"></a>        <span class="c1"># delete empty output file</span>
+</span><span id="__span-0-390"><a id="__codelineno-0-390" name="__codelineno-0-390"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">has_output_files</span><span class="p">():</span>
+</span><span id="__span-0-391"><a id="__codelineno-0-391" name="__codelineno-0-391"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">remove_texts</span><span class="p">()</span>
+</span><span id="__span-0-392"><a id="__codelineno-0-392" name="__codelineno-0-392"></a>
+</span><span id="__span-0-393"><a id="__codelineno-0-393" name="__codelineno-0-393"></a>    <span class="k">return</span> <span class="n">docs_count</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.save_texts_to_jsonl" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">save_texts_to_jsonl</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="n">append</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Write JSONL files to &lt;output_dir&gt;/&lt;DATASET_ID&gt;.jsonl
+(each line is a JSON object with "doc" field and text as plain text)</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-445">445</a></span>
+<span class="normal"><a href="#__codelineno-0-446">446</a></span>
+<span class="normal"><a href="#__codelineno-0-447">447</a></span>
+<span class="normal"><a href="#__codelineno-0-448">448</a></span>
+<span class="normal"><a href="#__codelineno-0-449">449</a></span>
+<span class="normal"><a href="#__codelineno-0-450">450</a></span>
+<span class="normal"><a href="#__codelineno-0-451">451</a></span>
+<span class="normal"><a href="#__codelineno-0-452">452</a></span>
+<span class="normal"><a href="#__codelineno-0-453">453</a></span>
+<span class="normal"><a href="#__codelineno-0-454">454</a></span>
+<span class="normal"><a href="#__codelineno-0-455">455</a></span>
+<span class="normal"><a href="#__codelineno-0-456">456</a></span>
+<span class="normal"><a href="#__codelineno-0-457">457</a></span>
+<span class="normal"><a href="#__codelineno-0-458">458</a></span>
+<span class="normal"><a href="#__codelineno-0-459">459</a></span>
+<span class="normal"><a href="#__codelineno-0-460">460</a></span>
+<span class="normal"><a href="#__codelineno-0-461">461</a></span>
+<span class="normal"><a href="#__codelineno-0-462">462</a></span>
+<span class="normal"><a href="#__codelineno-0-463">463</a></span>
+<span class="normal"><a href="#__codelineno-0-464">464</a></span>
+<span class="normal"><a href="#__codelineno-0-465">465</a></span>
+<span class="normal"><a href="#__codelineno-0-466">466</a></span>
+<span class="normal"><a href="#__codelineno-0-467">467</a></span>
+<span class="normal"><a href="#__codelineno-0-468">468</a></span>
+<span class="normal"><a href="#__codelineno-0-469">469</a></span>
+<span class="normal"><a href="#__codelineno-0-470">470</a></span>
+<span class="normal"><a href="#__codelineno-0-471">471</a></span>
+<span class="normal"><a href="#__codelineno-0-472">472</a></span>
+<span class="normal"><a href="#__codelineno-0-473">473</a></span>
+<span class="normal"><a href="#__codelineno-0-474">474</a></span>
+<span class="normal"><a href="#__codelineno-0-475">475</a></span>
+<span class="normal"><a href="#__codelineno-0-476">476</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-445"><a id="__codelineno-0-445" name="__codelineno-0-445"></a><span class="k">def</span> <span class="nf">save_texts_to_jsonl</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">append</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
+</span><span id="__span-0-446"><a id="__codelineno-0-446" name="__codelineno-0-446"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Write JSONL files to &lt;output_dir&gt;/&lt;DATASET_ID&gt;.jsonl</span>
+</span><span id="__span-0-447"><a id="__codelineno-0-447" name="__codelineno-0-447"></a><span class="sd">    (each line is a JSON object with &quot;doc&quot; field and text as plain text)</span>
+</span><span id="__span-0-448"><a id="__codelineno-0-448" name="__codelineno-0-448"></a><span class="sd">    &quot;&quot;&quot;</span>
+</span><span id="__span-0-449"><a id="__codelineno-0-449" name="__codelineno-0-449"></a>    <span class="n">mode</span> <span class="o">=</span> <span class="s2">&quot;a&quot;</span> <span class="k">if</span> <span class="n">append</span> <span class="k">else</span> <span class="s2">&quot;w&quot;</span>
+</span><span id="__span-0-450"><a id="__codelineno-0-450" name="__codelineno-0-450"></a>    <span class="n">fp</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="kc">True</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+</span><span id="__span-0-451"><a id="__codelineno-0-451" name="__codelineno-0-451"></a>
+</span><span id="__span-0-452"><a id="__codelineno-0-452" name="__codelineno-0-452"></a>    <span class="c1"># Save as JSONL</span>
+</span><span id="__span-0-453"><a id="__codelineno-0-453" name="__codelineno-0-453"></a>    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Writing JSONL output to </span><span class="si">{</span><span class="n">fp</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">mode</span><span class="si">=}</span><span class="s2">)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-454"><a id="__codelineno-0-454" name="__codelineno-0-454"></a>
+</span><span id="__span-0-455"><a id="__codelineno-0-455" name="__codelineno-0-455"></a>    <span class="n">docs_count</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-456"><a id="__codelineno-0-456" name="__codelineno-0-456"></a>
+</span><span id="__span-0-457"><a id="__codelineno-0-457" name="__codelineno-0-457"></a>    <span class="k">with</span> <span class="n">smart_open</span><span class="p">(</span><span class="n">fp</span><span class="p">,</span> <span class="n">mode</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+</span><span id="__span-0-458"><a id="__codelineno-0-458" name="__codelineno-0-458"></a>        <span class="k">for</span> <span class="n">docs_count</span><span class="p">,</span> <span class="n">text</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">filter_texts</span><span class="p">(</span><span class="n">texts</span><span class="p">),</span> <span class="mi">1</span><span class="p">):</span>
+</span><span id="__span-0-459"><a id="__codelineno-0-459" name="__codelineno-0-459"></a>            <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">({</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">():</span> <span class="n">text</span><span class="p">},</span> <span class="n">ensure_ascii</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">json_ensure_ascii</span><span class="p">)</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-460"><a id="__codelineno-0-460" name="__codelineno-0-460"></a>
+</span><span id="__span-0-461"><a id="__codelineno-0-461" name="__codelineno-0-461"></a>            <span class="k">if</span> <span class="n">docs_count</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="p">(</span><span class="n">docs_count</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="n">print_write_progress</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-462"><a id="__codelineno-0-462" name="__codelineno-0-462"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Written </span><span class="si">{</span><span class="n">docs_count</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2"> docs ...&quot;</span><span class="p">)</span>
+</span><span id="__span-0-463"><a id="__codelineno-0-463" name="__codelineno-0-463"></a>
+</span><span id="__span-0-464"><a id="__codelineno-0-464" name="__codelineno-0-464"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">limit</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="n">docs_count</span> <span class="o">&gt;=</span> <span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="p">:</span>
+</span><span id="__span-0-465"><a id="__codelineno-0-465" name="__codelineno-0-465"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Limit reached (</span><span class="si">{</span><span class="n">docs_count</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2"> docs)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-466"><a id="__codelineno-0-466" name="__codelineno-0-466"></a>
+</span><span id="__span-0-467"><a id="__codelineno-0-467" name="__codelineno-0-467"></a>                <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="s2">&quot;terminate&quot;</span><span class="p">):</span>
+</span><span id="__span-0-468"><a id="__codelineno-0-468" name="__codelineno-0-468"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Killing all remaining workers, if any&quot;</span><span class="p">)</span>
+</span><span id="__span-0-469"><a id="__codelineno-0-469" name="__codelineno-0-469"></a>                    <span class="n">texts</span><span class="o">.</span><span class="n">terminate</span><span class="p">()</span>
+</span><span id="__span-0-470"><a id="__codelineno-0-470" name="__codelineno-0-470"></a>                <span class="k">break</span>
+</span><span id="__span-0-471"><a id="__codelineno-0-471" name="__codelineno-0-471"></a>
+</span><span id="__span-0-472"><a id="__codelineno-0-472" name="__codelineno-0-472"></a>    <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="s2">&quot;terminate&quot;</span><span class="p">):</span>
+</span><span id="__span-0-473"><a id="__codelineno-0-473" name="__codelineno-0-473"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Killing all remaining workers, if any (iterator end)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-474"><a id="__codelineno-0-474" name="__codelineno-0-474"></a>        <span class="n">texts</span><span class="o">.</span><span class="n">terminate</span><span class="p">()</span>
+</span><span id="__span-0-475"><a id="__codelineno-0-475" name="__codelineno-0-475"></a>
+</span><span id="__span-0-476"><a id="__codelineno-0-476" name="__codelineno-0-476"></a>    <span class="k">return</span> <span class="n">docs_count</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.base.BaseDataset.save_texts_to_parquet" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">save_texts_to_parquet</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="n">file_path</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">apply_filter</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Save texts to Parquet in batches (text-only single-column schema, or the document schema when <code>use_documents</code> is enabled)</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/base.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-395">395</a></span>
+<span class="normal"><a href="#__codelineno-0-396">396</a></span>
+<span class="normal"><a href="#__codelineno-0-397">397</a></span>
+<span class="normal"><a href="#__codelineno-0-398">398</a></span>
+<span class="normal"><a href="#__codelineno-0-399">399</a></span>
+<span class="normal"><a href="#__codelineno-0-400">400</a></span>
+<span class="normal"><a href="#__codelineno-0-401">401</a></span>
+<span class="normal"><a href="#__codelineno-0-402">402</a></span>
+<span class="normal"><a href="#__codelineno-0-403">403</a></span>
+<span class="normal"><a href="#__codelineno-0-404">404</a></span>
+<span class="normal"><a href="#__codelineno-0-405">405</a></span>
+<span class="normal"><a href="#__codelineno-0-406">406</a></span>
+<span class="normal"><a href="#__codelineno-0-407">407</a></span>
+<span class="normal"><a href="#__codelineno-0-408">408</a></span>
+<span class="normal"><a href="#__codelineno-0-409">409</a></span>
+<span class="normal"><a href="#__codelineno-0-410">410</a></span>
+<span class="normal"><a href="#__codelineno-0-411">411</a></span>
+<span class="normal"><a href="#__codelineno-0-412">412</a></span>
+<span class="normal"><a href="#__codelineno-0-413">413</a></span>
+<span class="normal"><a href="#__codelineno-0-414">414</a></span>
+<span class="normal"><a href="#__codelineno-0-415">415</a></span>
+<span class="normal"><a href="#__codelineno-0-416">416</a></span>
+<span class="normal"><a href="#__codelineno-0-417">417</a></span>
+<span class="normal"><a href="#__codelineno-0-418">418</a></span>
+<span class="normal"><a href="#__codelineno-0-419">419</a></span>
+<span class="normal"><a href="#__codelineno-0-420">420</a></span>
+<span class="normal"><a href="#__codelineno-0-421">421</a></span>
+<span class="normal"><a href="#__codelineno-0-422">422</a></span>
+<span class="normal"><a href="#__codelineno-0-423">423</a></span>
+<span class="normal"><a href="#__codelineno-0-424">424</a></span>
+<span class="normal"><a href="#__codelineno-0-425">425</a></span>
+<span class="normal"><a href="#__codelineno-0-426">426</a></span>
+<span class="normal"><a href="#__codelineno-0-427">427</a></span>
+<span class="normal"><a href="#__codelineno-0-428">428</a></span>
+<span class="normal"><a href="#__codelineno-0-429">429</a></span>
+<span class="normal"><a href="#__codelineno-0-430">430</a></span>
+<span class="normal"><a href="#__codelineno-0-431">431</a></span>
+<span class="normal"><a href="#__codelineno-0-432">432</a></span>
+<span class="normal"><a href="#__codelineno-0-433">433</a></span>
+<span class="normal"><a href="#__codelineno-0-434">434</a></span>
+<span class="normal"><a href="#__codelineno-0-435">435</a></span>
+<span class="normal"><a href="#__codelineno-0-436">436</a></span>
+<span class="normal"><a href="#__codelineno-0-437">437</a></span>
+<span class="normal"><a href="#__codelineno-0-438">438</a></span>
+<span class="normal"><a href="#__codelineno-0-439">439</a></span>
+<span class="normal"><a href="#__codelineno-0-440">440</a></span>
+<span class="normal"><a href="#__codelineno-0-441">441</a></span>
+<span class="normal"><a href="#__codelineno-0-442">442</a></span>
+<span class="normal"><a href="#__codelineno-0-443">443</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-395"><a id="__codelineno-0-395" name="__codelineno-0-395"></a><span class="k">def</span> <span class="nf">save_texts_to_parquet</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">texts</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">file_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">apply_filter</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span>
+</span><span id="__span-0-396"><a id="__codelineno-0-396" name="__codelineno-0-396"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Save text in parquet (single column schema, in batches)&quot;&quot;&quot;</span>
+</span><span id="__span-0-397"><a id="__codelineno-0-397" name="__codelineno-0-397"></a>    <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">output_format</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span>
+</span><span id="__span-0-398"><a id="__codelineno-0-398" name="__codelineno-0-398"></a>
+</span><span id="__span-0-399"><a id="__codelineno-0-399" name="__codelineno-0-399"></a>    <span class="k">if</span> <span class="n">file_path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-400"><a id="__codelineno-0-400" name="__codelineno-0-400"></a>        <span class="n">file_path</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_output_file_paths</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="kc">True</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+</span><span id="__span-0-401"><a id="__codelineno-0-401" name="__codelineno-0-401"></a>
+</span><span id="__span-0-402"><a id="__codelineno-0-402" name="__codelineno-0-402"></a>    <span class="k">if</span> <span class="n">apply_filter</span><span class="p">:</span>
+</span><span id="__span-0-403"><a id="__codelineno-0-403" name="__codelineno-0-403"></a>        <span class="n">texts</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">filter_texts_or_documents</span><span class="p">(</span><span class="n">texts</span><span class="p">)</span>
+</span><span id="__span-0-404"><a id="__codelineno-0-404" name="__codelineno-0-404"></a>
+</span><span id="__span-0-405"><a id="__codelineno-0-405" name="__codelineno-0-405"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">use_documents</span><span class="p">:</span>
+</span><span id="__span-0-406"><a id="__codelineno-0-406" name="__codelineno-0-406"></a>        <span class="c1"># document schema</span>
+</span><span id="__span-0-407"><a id="__codelineno-0-407" name="__codelineno-0-407"></a>        <span class="n">schema</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_document_schema</span><span class="p">()</span><span class="o">.</span><span class="n">get_pa_schema</span><span class="p">()</span>
+</span><span id="__span-0-408"><a id="__codelineno-0-408" name="__codelineno-0-408"></a>    <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-409"><a id="__codelineno-0-409" name="__codelineno-0-409"></a>        <span class="c1"># text-only schema</span>
+</span><span id="__span-0-410"><a id="__codelineno-0-410" name="__codelineno-0-410"></a>        <span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span>
+</span><span id="__span-0-411"><a id="__codelineno-0-411" name="__codelineno-0-411"></a>            <span class="p">[</span>
+</span><span id="__span-0-412"><a id="__codelineno-0-412" name="__codelineno-0-412"></a>                <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_output_text_field</span><span class="p">(),</span> <span class="n">pa</span><span class="o">.</span><span class="n">string</span><span class="p">()),</span>
+</span><span id="__span-0-413"><a id="__codelineno-0-413" name="__codelineno-0-413"></a>            <span class="p">]</span>
+</span><span id="__span-0-414"><a id="__codelineno-0-414" name="__codelineno-0-414"></a>        <span class="p">)</span>
+</span><span id="__span-0-415"><a id="__codelineno-0-415" name="__codelineno-0-415"></a>
+</span><span id="__span-0-416"><a id="__codelineno-0-416" name="__codelineno-0-416"></a>    <span class="c1"># Max. chunk size is multiplied with this factor</span>
+</span><span id="__span-0-417"><a id="__codelineno-0-417" name="__codelineno-0-417"></a>    <span class="c1"># (to account for inaccurate chunk sizes due to batching)</span>
+</span><span id="__span-0-418"><a id="__codelineno-0-418" name="__codelineno-0-418"></a>    <span class="n">safety_factor</span> <span class="o">=</span> <span class="mf">0.975</span>
+</span><span id="__span-0-419"><a id="__codelineno-0-419" name="__codelineno-0-419"></a>
+</span><span id="__span-0-420"><a id="__codelineno-0-420" name="__codelineno-0-420"></a>    <span class="c1"># Save as Parquet file</span>
+</span><span id="__span-0-421"><a id="__codelineno-0-421" name="__codelineno-0-421"></a>    <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Writing parquet output (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span><span class="si">=}</span><span class="s2">; </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="si">=}</span><span class="s2">; </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">output_compression</span><span class="si">=}</span><span class="s2">)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-422"><a id="__codelineno-0-422" name="__codelineno-0-422"></a>
+</span><span id="__span-0-423"><a id="__codelineno-0-423" name="__codelineno-0-423"></a>    <span class="n">saved_docs</span><span class="p">,</span> <span class="n">saved_chunks</span> <span class="o">=</span> <span class="n">save_texts_to_parquet_chunks</span><span class="p">(</span>
+</span><span id="__span-0-424"><a id="__codelineno-0-424" name="__codelineno-0-424"></a>        <span class="n">texts</span><span class="o">=</span><span class="n">texts</span><span class="p">,</span>
+</span><span id="__span-0-425"><a id="__codelineno-0-425" name="__codelineno-0-425"></a>        <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span>
+</span><span id="__span-0-426"><a id="__codelineno-0-426" name="__codelineno-0-426"></a>        <span class="n">max_chunk_uncompressed_bytes</span><span class="o">=</span><span class="p">(</span>
+</span><span id="__span-0-427"><a id="__codelineno-0-427" name="__codelineno-0-427"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_uncompressed_bytes</span> <span class="o">*</span> <span class="n">safety_factor</span>
+</span><span id="__span-0-428"><a id="__codelineno-0-428" name="__codelineno-0-428"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_uncompressed_bytes</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
+</span><span id="__span-0-429"><a id="__codelineno-0-429" name="__codelineno-0-429"></a>            <span class="k">else</span> <span class="kc">None</span>
+</span><span id="__span-0-430"><a id="__codelineno-0-430" name="__codelineno-0-430"></a>        <span class="p">),</span>
+</span><span id="__span-0-431"><a id="__codelineno-0-431" name="__codelineno-0-431"></a>        <span class="n">max_chunk_rows</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">max_output_chunk_rows</span><span class="p">,</span>
+</span><span id="__span-0-432"><a id="__codelineno-0-432" name="__codelineno-0-432"></a>        <span class="n">output_path_func</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">get_single_or_chunked_output_file_path</span><span class="p">,</span>
+</span><span id="__span-0-433"><a id="__codelineno-0-433" name="__codelineno-0-433"></a>        <span class="n">compression</span><span class="o">=</span><span class="n">get_parquet_compression</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">output_compression</span><span class="p">),</span>
+</span><span id="__span-0-434"><a id="__codelineno-0-434" name="__codelineno-0-434"></a>        <span class="n">batch_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">output_batch_size</span><span class="p">,</span>
+</span><span id="__span-0-435"><a id="__codelineno-0-435" name="__codelineno-0-435"></a>        <span class="n">print_write_progress</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">print_write_progress</span><span class="p">,</span>
+</span><span id="__span-0-436"><a id="__codelineno-0-436" name="__codelineno-0-436"></a>        <span class="n">limit</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="p">,</span>
+</span><span id="__span-0-437"><a id="__codelineno-0-437" name="__codelineno-0-437"></a>    <span class="p">)</span>
+</span><span id="__span-0-438"><a id="__codelineno-0-438" name="__codelineno-0-438"></a>
+</span><span id="__span-0-439"><a id="__codelineno-0-439" name="__codelineno-0-439"></a>    <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">texts</span><span class="p">,</span> <span class="s2">&quot;terminate&quot;</span><span class="p">):</span>
+</span><span id="__span-0-440"><a id="__codelineno-0-440" name="__codelineno-0-440"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Killing all remaining workers, if any (iterator end)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-441"><a id="__codelineno-0-441" name="__codelineno-0-441"></a>        <span class="n">texts</span><span class="o">.</span><span class="n">terminate</span><span class="p">()</span>
+</span><span id="__span-0-442"><a id="__codelineno-0-442" name="__codelineno-0-442"></a>
+</span><span id="__span-0-443"><a id="__codelineno-0-443" name="__codelineno-0-443"></a>    <span class="k">return</span> <span class="n">saved_docs</span><span class="p">,</span> <span class="n">saved_chunks</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/api/config/index.html b/api/config/index.html
new file mode 100644
index 0000000..7d20bcf
--- /dev/null
+++ b/api/config/index.html
@@ -0,0 +1,980 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://malteos.github.io/llm-datasets/api/config/">
+      
+      
+        <link rel="prev" href="../jsonl_dataset/">
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Config - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#config" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Config
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.utils.config.Config" class="md-nav__link">
+    <span class="md-ellipsis">
+      Config
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.utils.config.Config.get_job_id" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_job_id
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.utils.config.Config" class="md-nav__link">
+    <span class="md-ellipsis">
+      Config
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.utils.config.Config.get_job_id" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_job_id
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="config">Config</h1>
+
+
+<div class="doc doc-object doc-class">
+
+
+
+<a id="llm_datasets.utils.config.Config"></a>
+    <div class="doc doc-contents first">
+            <p class="doc doc-class-bases">
+              Bases: <code>object</code></p>
+
+
+              <details class="quote">
+                <summary>Source code in <code>src/llm_datasets/utils/config.py</code></summary>
+                <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-64"> 64</a></span>
+<span class="normal"><a href="#__codelineno-0-65"> 65</a></span>
+<span class="normal"><a href="#__codelineno-0-66"> 66</a></span>
+<span class="normal"><a href="#__codelineno-0-67"> 67</a></span>
+<span class="normal"><a href="#__codelineno-0-68"> 68</a></span>
+<span class="normal"><a href="#__codelineno-0-69"> 69</a></span>
+<span class="normal"><a href="#__codelineno-0-70"> 70</a></span>
+<span class="normal"><a href="#__codelineno-0-71"> 71</a></span>
+<span class="normal"><a href="#__codelineno-0-72"> 72</a></span>
+<span class="normal"><a href="#__codelineno-0-73"> 73</a></span>
+<span class="normal"><a href="#__codelineno-0-74"> 74</a></span>
+<span class="normal"><a href="#__codelineno-0-75"> 75</a></span>
+<span class="normal"><a href="#__codelineno-0-76"> 76</a></span>
+<span class="normal"><a href="#__codelineno-0-77"> 77</a></span>
+<span class="normal"><a href="#__codelineno-0-78"> 78</a></span>
+<span class="normal"><a href="#__codelineno-0-79"> 79</a></span>
+<span class="normal"><a href="#__codelineno-0-80"> 80</a></span>
+<span class="normal"><a href="#__codelineno-0-81"> 81</a></span>
+<span class="normal"><a href="#__codelineno-0-82"> 82</a></span>
+<span class="normal"><a href="#__codelineno-0-83"> 83</a></span>
+<span class="normal"><a href="#__codelineno-0-84"> 84</a></span>
+<span class="normal"><a href="#__codelineno-0-85"> 85</a></span>
+<span class="normal"><a href="#__codelineno-0-86"> 86</a></span>
+<span class="normal"><a href="#__codelineno-0-87"> 87</a></span>
+<span class="normal"><a href="#__codelineno-0-88"> 88</a></span>
+<span class="normal"><a href="#__codelineno-0-89"> 89</a></span>
+<span class="normal"><a href="#__codelineno-0-90"> 90</a></span>
+<span class="normal"><a href="#__codelineno-0-91"> 91</a></span>
+<span class="normal"><a href="#__codelineno-0-92"> 92</a></span>
+<span class="normal"><a href="#__codelineno-0-93"> 93</a></span>
+<span class="normal"><a href="#__codelineno-0-94"> 94</a></span>
+<span class="normal"><a href="#__codelineno-0-95"> 95</a></span>
+<span class="normal"><a href="#__codelineno-0-96"> 96</a></span>
+<span class="normal"><a href="#__codelineno-0-97"> 97</a></span>
+<span class="normal"><a href="#__codelineno-0-98"> 98</a></span>
+<span class="normal"><a href="#__codelineno-0-99"> 99</a></span>
+<span class="normal"><a href="#__codelineno-0-100">100</a></span>
+<span class="normal"><a href="#__codelineno-0-101">101</a></span>
+<span class="normal"><a href="#__codelineno-0-102">102</a></span>
+<span class="normal"><a href="#__codelineno-0-103">103</a></span>
+<span class="normal"><a href="#__codelineno-0-104">104</a></span>
+<span class="normal"><a href="#__codelineno-0-105">105</a></span>
+<span class="normal"><a href="#__codelineno-0-106">106</a></span>
+<span class="normal"><a href="#__codelineno-0-107">107</a></span>
+<span class="normal"><a href="#__codelineno-0-108">108</a></span>
+<span class="normal"><a href="#__codelineno-0-109">109</a></span>
+<span class="normal"><a href="#__codelineno-0-110">110</a></span>
+<span class="normal"><a href="#__codelineno-0-111">111</a></span>
+<span class="normal"><a href="#__codelineno-0-112">112</a></span>
+<span class="normal"><a href="#__codelineno-0-113">113</a></span>
+<span class="normal"><a href="#__codelineno-0-114">114</a></span>
+<span class="normal"><a href="#__codelineno-0-115">115</a></span>
+<span class="normal"><a href="#__codelineno-0-116">116</a></span>
+<span class="normal"><a href="#__codelineno-0-117">117</a></span>
+<span class="normal"><a href="#__codelineno-0-118">118</a></span>
+<span class="normal"><a href="#__codelineno-0-119">119</a></span>
+<span class="normal"><a href="#__codelineno-0-120">120</a></span>
+<span class="normal"><a href="#__codelineno-0-121">121</a></span>
+<span class="normal"><a href="#__codelineno-0-122">122</a></span>
+<span class="normal"><a href="#__codelineno-0-123">123</a></span>
+<span class="normal"><a href="#__codelineno-0-124">124</a></span>
+<span class="normal"><a href="#__codelineno-0-125">125</a></span>
+<span class="normal"><a href="#__codelineno-0-126">126</a></span>
+<span class="normal"><a href="#__codelineno-0-127">127</a></span>
+<span class="normal"><a href="#__codelineno-0-128">128</a></span>
+<span class="normal"><a href="#__codelineno-0-129">129</a></span>
+<span class="normal"><a href="#__codelineno-0-130">130</a></span>
+<span class="normal"><a href="#__codelineno-0-131">131</a></span>
+<span class="normal"><a href="#__codelineno-0-132">132</a></span>
+<span class="normal"><a href="#__codelineno-0-133">133</a></span>
+<span class="normal"><a href="#__codelineno-0-134">134</a></span>
+<span class="normal"><a href="#__codelineno-0-135">135</a></span>
+<span class="normal"><a href="#__codelineno-0-136">136</a></span>
+<span class="normal"><a href="#__codelineno-0-137">137</a></span>
+<span class="normal"><a href="#__codelineno-0-138">138</a></span>
+<span class="normal"><a href="#__codelineno-0-139">139</a></span>
+<span class="normal"><a href="#__codelineno-0-140">140</a></span>
+<span class="normal"><a href="#__codelineno-0-141">141</a></span>
+<span class="normal"><a href="#__codelineno-0-142">142</a></span>
+<span class="normal"><a href="#__codelineno-0-143">143</a></span>
+<span class="normal"><a href="#__codelineno-0-144">144</a></span>
+<span class="normal"><a href="#__codelineno-0-145">145</a></span>
+<span class="normal"><a href="#__codelineno-0-146">146</a></span>
+<span class="normal"><a href="#__codelineno-0-147">147</a></span>
+<span class="normal"><a href="#__codelineno-0-148">148</a></span>
+<span class="normal"><a href="#__codelineno-0-149">149</a></span>
+<span class="normal"><a href="#__codelineno-0-150">150</a></span>
+<span class="normal"><a href="#__codelineno-0-151">151</a></span>
+<span class="normal"><a href="#__codelineno-0-152">152</a></span>
+<span class="normal"><a href="#__codelineno-0-153">153</a></span>
+<span class="normal"><a href="#__codelineno-0-154">154</a></span>
+<span class="normal"><a href="#__codelineno-0-155">155</a></span>
+<span class="normal"><a href="#__codelineno-0-156">156</a></span>
+<span class="normal"><a href="#__codelineno-0-157">157</a></span>
+<span class="normal"><a href="#__codelineno-0-158">158</a></span>
+<span class="normal"><a href="#__codelineno-0-159">159</a></span>
+<span class="normal"><a href="#__codelineno-0-160">160</a></span>
+<span class="normal"><a href="#__codelineno-0-161">161</a></span>
+<span class="normal"><a href="#__codelineno-0-162">162</a></span>
+<span class="normal"><a href="#__codelineno-0-163">163</a></span>
+<span class="normal"><a href="#__codelineno-0-164">164</a></span>
+<span class="normal"><a href="#__codelineno-0-165">165</a></span>
+<span class="normal"><a href="#__codelineno-0-166">166</a></span>
+<span class="normal"><a href="#__codelineno-0-167">167</a></span>
+<span class="normal"><a href="#__codelineno-0-168">168</a></span>
+<span class="normal"><a href="#__codelineno-0-169">169</a></span>
+<span class="normal"><a href="#__codelineno-0-170">170</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-64"><a id="__codelineno-0-64" name="__codelineno-0-64"></a><span class="k">class</span> <span class="nc">Config</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+</span><span id="__span-0-65"><a id="__codelineno-0-65" name="__codelineno-0-65"></a>    <span class="n">text_datasets_dir</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-66"><a id="__codelineno-0-66" name="__codelineno-0-66"></a>    <span class="n">output_format</span> <span class="o">=</span> <span class="s2">&quot;jsonl&quot;</span>
+</span><span id="__span-0-67"><a id="__codelineno-0-67" name="__codelineno-0-67"></a>    <span class="n">output_compression</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-68"><a id="__codelineno-0-68" name="__codelineno-0-68"></a>
+</span><span id="__span-0-69"><a id="__codelineno-0-69" name="__codelineno-0-69"></a>    <span class="n">raw_datasets_dir</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-70"><a id="__codelineno-0-70" name="__codelineno-0-70"></a>    <span class="n">shuffled_datasets_dir</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-71"><a id="__codelineno-0-71" name="__codelineno-0-71"></a>
+</span><span id="__span-0-72"><a id="__codelineno-0-72" name="__codelineno-0-72"></a>    <span class="n">composed_dataset_dir</span> <span class="o">=</span> <span class="p">(</span>
+</span><span id="__span-0-73"><a id="__codelineno-0-73" name="__codelineno-0-73"></a>        <span class="kc">None</span>  <span class="c1"># composed dataset (train/val split) is saved into this directory</span>
+</span><span id="__span-0-74"><a id="__codelineno-0-74" name="__codelineno-0-74"></a>    <span class="p">)</span>
+</span><span id="__span-0-75"><a id="__codelineno-0-75" name="__codelineno-0-75"></a>    <span class="n">local_dirs_by_dataset_id</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-76"><a id="__codelineno-0-76" name="__codelineno-0-76"></a>    <span class="n">local_dirs_by_source_id</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-77"><a id="__codelineno-0-77" name="__codelineno-0-77"></a>    <span class="n">sampling_factor_by_dataset_id</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-78"><a id="__codelineno-0-78" name="__codelineno-0-78"></a>    <span class="n">sampling_factor_by_source_id</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-79"><a id="__codelineno-0-79" name="__codelineno-0-79"></a>    <span class="n">sampling_factor_by_language</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-80"><a id="__codelineno-0-80" name="__codelineno-0-80"></a>
+</span><span id="__span-0-81"><a id="__codelineno-0-81" name="__codelineno-0-81"></a>    <span class="n">only_selected_datasets</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-82"><a id="__codelineno-0-82" name="__codelineno-0-82"></a>    <span class="n">selected_dataset_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-83"><a id="__codelineno-0-83" name="__codelineno-0-83"></a>    <span class="n">selected_source_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
+</span><span id="__span-0-84"><a id="__codelineno-0-84" name="__codelineno-0-84"></a>
+</span><span id="__span-0-85"><a id="__codelineno-0-85" name="__codelineno-0-85"></a>    <span class="n">validation_ratio</span> <span class="o">=</span> <span class="mf">0.005</span>  <span class="c1"># number of documents in the split: len(dataset) * ratio</span>
+</span><span id="__span-0-86"><a id="__codelineno-0-86" name="__codelineno-0-86"></a>    <span class="n">validation_min_total_docs</span> <span class="o">=</span> <span class="p">(</span>
+</span><span id="__span-0-87"><a id="__codelineno-0-87" name="__codelineno-0-87"></a>        <span class="mi">1_000</span>  <span class="c1"># to be used as validation set, the dataset must have at least n docs</span>
+</span><span id="__span-0-88"><a id="__codelineno-0-88" name="__codelineno-0-88"></a>    <span class="p">)</span>
+</span><span id="__span-0-89"><a id="__codelineno-0-89" name="__codelineno-0-89"></a>    <span class="n">validation_max_split_docs</span> <span class="o">=</span> <span class="p">(</span>
+</span><span id="__span-0-90"><a id="__codelineno-0-90" name="__codelineno-0-90"></a>        <span class="mi">1_000</span>  <span class="c1"># number of documents in validation split is capped at this number</span>
+</span><span id="__span-0-91"><a id="__codelineno-0-91" name="__codelineno-0-91"></a>    <span class="p">)</span>
+</span><span id="__span-0-92"><a id="__codelineno-0-92" name="__codelineno-0-92"></a>    <span class="n">validation_min_split_docs</span> <span class="o">=</span> <span class="mi">10</span>  <span class="c1"># split must have at least this number of documents, otherwise it will be discarded</span>
+</span><span id="__span-0-93"><a id="__codelineno-0-93" name="__codelineno-0-93"></a>    <span class="n">tokenizer_train_ratio</span> <span class="o">=</span> <span class="mf">0.1</span>  <span class="c1"># % of train data used for tokenizer training</span>
+</span><span id="__span-0-94"><a id="__codelineno-0-94" name="__codelineno-0-94"></a>
+</span><span id="__span-0-95"><a id="__codelineno-0-95" name="__codelineno-0-95"></a>    <span class="c1"># Vocab size should be divisible by 8</span>
+</span><span id="__span-0-96"><a id="__codelineno-0-96" name="__codelineno-0-96"></a>    <span class="c1"># - Jan&#39;s recommendation: 250680</span>
+</span><span id="__span-0-97"><a id="__codelineno-0-97" name="__codelineno-0-97"></a>    <span class="c1"># - NVIDIA recommendation for multilingual models: 256000</span>
+</span><span id="__span-0-98"><a id="__codelineno-0-98" name="__codelineno-0-98"></a>    <span class="n">tokenizer_vocab_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">256000</span>
+</span><span id="__span-0-99"><a id="__codelineno-0-99" name="__codelineno-0-99"></a>    <span class="n">tokenizer_model_type</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span>
+</span><span id="__span-0-100"><a id="__codelineno-0-100" name="__codelineno-0-100"></a>        <span class="s2">&quot;bpe&quot;</span><span class="p">,</span> <span class="s2">&quot;unigram&quot;</span><span class="p">,</span> <span class="s2">&quot;word&quot;</span><span class="p">,</span> <span class="s2">&quot;char&quot;</span>
+</span><span id="__span-0-101"><a id="__codelineno-0-101" name="__codelineno-0-101"></a>    <span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;bpe&quot;</span>  <span class="c1"># SP model types</span>
+</span><span id="__span-0-102"><a id="__codelineno-0-102" name="__codelineno-0-102"></a>
+</span><span id="__span-0-103"><a id="__codelineno-0-103" name="__codelineno-0-103"></a>    <span class="n">seed</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-104"><a id="__codelineno-0-104" name="__codelineno-0-104"></a>
+</span><span id="__span-0-105"><a id="__codelineno-0-105" name="__codelineno-0-105"></a>    <span class="n">extra_dataset_registries</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-106"><a id="__codelineno-0-106" name="__codelineno-0-106"></a>    <span class="n">extra_dataset_classes</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="n">List</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-107"><a id="__codelineno-0-107" name="__codelineno-0-107"></a>    <span class="n">use_default_dataset_registry</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
+</span><span id="__span-0-108"><a id="__codelineno-0-108" name="__codelineno-0-108"></a>
+</span><span id="__span-0-109"><a id="__codelineno-0-109" name="__codelineno-0-109"></a>    <span class="c1"># Datasets are initialized with these kwargs</span>
+</span><span id="__span-0-110"><a id="__codelineno-0-110" name="__codelineno-0-110"></a>    <span class="n">extra_dataset_kwargs</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">dict</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-111"><a id="__codelineno-0-111" name="__codelineno-0-111"></a>
+</span><span id="__span-0-112"><a id="__codelineno-0-112" name="__codelineno-0-112"></a>    <span class="n">use_documents</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-113"><a id="__codelineno-0-113" name="__codelineno-0-113"></a>    <span class="n">workers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-114"><a id="__codelineno-0-114" name="__codelineno-0-114"></a>    <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-115"><a id="__codelineno-0-115" name="__codelineno-0-115"></a>    <span class="n">skip_items</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-116"><a id="__codelineno-0-116" name="__codelineno-0-116"></a>    <span class="n">job_id</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-117"><a id="__codelineno-0-117" name="__codelineno-0-117"></a>    <span class="n">save_stats</span> <span class="o">=</span> <span class="kc">True</span>
+</span><span id="__span-0-118"><a id="__codelineno-0-118" name="__codelineno-0-118"></a>    <span class="n">verbose</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-119"><a id="__codelineno-0-119" name="__codelineno-0-119"></a>    <span class="n">log_file</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-120"><a id="__codelineno-0-120" name="__codelineno-0-120"></a>    <span class="n">override</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-121"><a id="__codelineno-0-121" name="__codelineno-0-121"></a>
+</span><span id="__span-0-122"><a id="__codelineno-0-122" name="__codelineno-0-122"></a>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">entries</span><span class="p">):</span>
+</span><span id="__span-0-123"><a id="__codelineno-0-123" name="__codelineno-0-123"></a>        <span class="bp">self</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">entries</span><span class="p">)</span>
+</span><span id="__span-0-124"><a id="__codelineno-0-124" name="__codelineno-0-124"></a>
+</span><span id="__span-0-125"><a id="__codelineno-0-125" name="__codelineno-0-125"></a>    <span class="k">def</span> <span class="nf">init_logger</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">logger_name</span><span class="p">):</span>
+</span><span id="__span-0-126"><a id="__codelineno-0-126" name="__codelineno-0-126"></a>        <span class="n">log_handlers</span> <span class="o">=</span> <span class="p">[</span><span class="n">logging</span><span class="o">.</span><span class="n">StreamHandler</span><span class="p">()]</span>
+</span><span id="__span-0-127"><a id="__codelineno-0-127" name="__codelineno-0-127"></a>
+</span><span id="__span-0-128"><a id="__codelineno-0-128" name="__codelineno-0-128"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">log_file</span><span class="p">:</span>
+</span><span id="__span-0-129"><a id="__codelineno-0-129" name="__codelineno-0-129"></a>            <span class="n">log_handlers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">logging</span><span class="o">.</span><span class="n">FileHandler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">log_file</span><span class="p">))</span>
+</span><span id="__span-0-130"><a id="__codelineno-0-130" name="__codelineno-0-130"></a>
+</span><span id="__span-0-131"><a id="__codelineno-0-131" name="__codelineno-0-131"></a>        <span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span>
+</span><span id="__span-0-132"><a id="__codelineno-0-132" name="__codelineno-0-132"></a>            <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;</span><span class="si">%(asctime)s</span><span class="s2"> - </span><span class="si">%(levelname)s</span><span class="s2"> - </span><span class="si">%(name)s</span><span class="s2"> -   </span><span class="si">%(message)s</span><span class="s2">&quot;</span><span class="p">,</span>
+</span><span id="__span-0-133"><a id="__codelineno-0-133" name="__codelineno-0-133"></a>            <span class="n">datefmt</span><span class="o">=</span><span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2"> %H:%M:%S&quot;</span><span class="p">,</span>
+</span><span id="__span-0-134"><a id="__codelineno-0-134" name="__codelineno-0-134"></a>            <span class="n">level</span><span class="o">=</span><span class="n">logging</span><span class="o">.</span><span class="n">DEBUG</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">verbose</span> <span class="k">else</span> <span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">,</span>
+</span><span id="__span-0-135"><a id="__codelineno-0-135" name="__codelineno-0-135"></a>            <span class="n">handlers</span><span class="o">=</span><span class="n">log_handlers</span><span class="p">,</span>
+</span><span id="__span-0-136"><a id="__codelineno-0-136" name="__codelineno-0-136"></a>        <span class="p">)</span>
+</span><span id="__span-0-137"><a id="__codelineno-0-137" name="__codelineno-0-137"></a>        <span class="n">logger</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">logger_name</span><span class="p">)</span>
+</span><span id="__span-0-138"><a id="__codelineno-0-138" name="__codelineno-0-138"></a>
+</span><span id="__span-0-139"><a id="__codelineno-0-139" name="__codelineno-0-139"></a>        <span class="k">return</span> <span class="n">logger</span>
+</span><span id="__span-0-140"><a id="__codelineno-0-140" name="__codelineno-0-140"></a>
+</span><span id="__span-0-141"><a id="__codelineno-0-141" name="__codelineno-0-141"></a>    <span class="k">def</span> <span class="nf">get_extra_dataset_kwargs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset_id</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">dict</span><span class="p">:</span>
+</span><span id="__span-0-142"><a id="__codelineno-0-142" name="__codelineno-0-142"></a>        <span class="k">try</span><span class="p">:</span>
+</span><span id="__span-0-143"><a id="__codelineno-0-143" name="__codelineno-0-143"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">extra_dataset_kwargs</span><span class="p">[</span><span class="n">dataset_id</span><span class="p">]</span>
+</span><span id="__span-0-144"><a id="__codelineno-0-144" name="__codelineno-0-144"></a>        <span class="k">except</span> <span class="ne">KeyError</span><span class="p">:</span>
+</span><span id="__span-0-145"><a id="__codelineno-0-145" name="__codelineno-0-145"></a>            <span class="k">return</span> <span class="p">{}</span>
+</span><span id="__span-0-146"><a id="__codelineno-0-146" name="__codelineno-0-146"></a>
+</span><span id="__span-0-147"><a id="__codelineno-0-147" name="__codelineno-0-147"></a>    <span class="k">def</span> <span class="nf">get_selected_dataset_ids</span><span class="p">(</span>
+</span><span id="__span-0-148"><a id="__codelineno-0-148" name="__codelineno-0-148"></a>        <span class="bp">self</span><span class="p">,</span> <span class="n">mode</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;all&quot;</span><span class="p">,</span> <span class="s2">&quot;exact&quot;</span><span class="p">,</span> <span class="s2">&quot;fnmatch&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;all&quot;</span>
+</span><span id="__span-0-149"><a id="__codelineno-0-149" name="__codelineno-0-149"></a>    <span class="p">):</span>
+</span><span id="__span-0-150"><a id="__codelineno-0-150" name="__codelineno-0-150"></a>        <span class="k">if</span> <span class="n">mode</span> <span class="o">==</span> <span class="s2">&quot;exact&quot;</span><span class="p">:</span>
+</span><span id="__span-0-151"><a id="__codelineno-0-151" name="__codelineno-0-151"></a>            <span class="c1"># only ids for exact match</span>
+</span><span id="__span-0-152"><a id="__codelineno-0-152" name="__codelineno-0-152"></a>            <span class="k">return</span> <span class="p">[</span>
+</span><span id="__span-0-153"><a id="__codelineno-0-153" name="__codelineno-0-153"></a>                <span class="n">s</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">selected_dataset_ids</span> <span class="k">if</span> <span class="s2">&quot;*&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">s</span> <span class="ow">and</span> <span class="s2">&quot;?&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">s</span>
+</span><span id="__span-0-154"><a id="__codelineno-0-154" name="__codelineno-0-154"></a>            <span class="p">]</span>
+</span><span id="__span-0-155"><a id="__codelineno-0-155" name="__codelineno-0-155"></a>        <span class="k">elif</span> <span class="n">mode</span> <span class="o">==</span> <span class="s2">&quot;fnmatch&quot;</span><span class="p">:</span>
+</span><span id="__span-0-156"><a id="__codelineno-0-156" name="__codelineno-0-156"></a>            <span class="c1"># only ids for fnmatch</span>
+</span><span id="__span-0-157"><a id="__codelineno-0-157" name="__codelineno-0-157"></a>            <span class="k">return</span> <span class="p">[</span><span class="n">s</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">selected_dataset_ids</span> <span class="k">if</span> <span class="s2">&quot;*&quot;</span> <span class="ow">in</span> <span class="n">s</span> <span class="ow">or</span> <span class="s2">&quot;?&quot;</span> <span class="ow">in</span> <span class="n">s</span><span class="p">]</span>
+</span><span id="__span-0-158"><a id="__codelineno-0-158" name="__codelineno-0-158"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-159"><a id="__codelineno-0-159" name="__codelineno-0-159"></a>            <span class="c1"># all</span>
+</span><span id="__span-0-160"><a id="__codelineno-0-160" name="__codelineno-0-160"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">selected_dataset_ids</span>
+</span><span id="__span-0-161"><a id="__codelineno-0-161" name="__codelineno-0-161"></a>
+</span><span id="__span-0-162"><a id="__codelineno-0-162" name="__codelineno-0-162"></a>    <span class="k">def</span> <span class="nf">get_job_id</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="nb">str</span><span class="p">]:</span>
+</span><span id="__span-0-163"><a id="__codelineno-0-163" name="__codelineno-0-163"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Returns manually set job ID or from environment variable (SLURM_JOBID)&quot;&quot;&quot;</span>
+</span><span id="__span-0-164"><a id="__codelineno-0-164" name="__codelineno-0-164"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">job_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-165"><a id="__codelineno-0-165" name="__codelineno-0-165"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">job_id</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;SLURM_JOBID&quot;</span><span class="p">,</span> <span class="s2">&quot;0&quot;</span><span class="p">)</span>
+</span><span id="__span-0-166"><a id="__codelineno-0-166" name="__codelineno-0-166"></a>
+</span><span id="__span-0-167"><a id="__codelineno-0-167" name="__codelineno-0-167"></a>        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">job_id</span>
+</span><span id="__span-0-168"><a id="__codelineno-0-168" name="__codelineno-0-168"></a>
+</span><span id="__span-0-169"><a id="__codelineno-0-169" name="__codelineno-0-169"></a>    <span class="k">def</span> <span class="nf">get_key_value_pairs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">keys</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">:</span>
+</span><span id="__span-0-170"><a id="__codelineno-0-170" name="__codelineno-0-170"></a>        <span class="k">return</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">keys</span><span class="p">}</span>
+</span></code></pre></div></td></tr></table></div>
+              </details>
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.utils.config.Config.get_job_id" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_job_id</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Returns manually set job ID or from environment variable (SLURM_JOBID)</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/utils/config.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-162">162</a></span>
+<span class="normal"><a href="#__codelineno-0-163">163</a></span>
+<span class="normal"><a href="#__codelineno-0-164">164</a></span>
+<span class="normal"><a href="#__codelineno-0-165">165</a></span>
+<span class="normal"><a href="#__codelineno-0-166">166</a></span>
+<span class="normal"><a href="#__codelineno-0-167">167</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-162"><a id="__codelineno-0-162" name="__codelineno-0-162"></a><span class="k">def</span> <span class="nf">get_job_id</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="nb">str</span><span class="p">]:</span>
+</span><span id="__span-0-163"><a id="__codelineno-0-163" name="__codelineno-0-163"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Returns manually set job ID or from environment variable (SLURM_JOBID)&quot;&quot;&quot;</span>
+</span><span id="__span-0-164"><a id="__codelineno-0-164" name="__codelineno-0-164"></a>    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">job_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-165"><a id="__codelineno-0-165" name="__codelineno-0-165"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">job_id</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;SLURM_JOBID&quot;</span><span class="p">,</span> <span class="s2">&quot;0&quot;</span><span class="p">)</span>
+</span><span id="__span-0-166"><a id="__codelineno-0-166" name="__codelineno-0-166"></a>
+</span><span id="__span-0-167"><a id="__codelineno-0-167" name="__codelineno-0-167"></a>    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">job_id</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/api/hf_dataset/index.html b/api/hf_dataset/index.html
new file mode 100644
index 0000000..a54ff14
--- /dev/null
+++ b/api/hf_dataset/index.html
@@ -0,0 +1,992 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/api/hf_dataset/">
+      
+      
+        <link rel="prev" href="../base_dataset/">
+      
+      
+        <link rel="next" href="../jsonl_dataset/">
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>HFDataset - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#hfdataset" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              HFDataset
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.hf_dataset.HFDataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      HFDataset
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.hf_dataset.HFDataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      HFDataset
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="hfdataset">HFDataset</h1>
+
+
+<div class="doc doc-object doc-class">
+
+
+
+<a id="llm_datasets.datasets.hf_dataset.HFDataset"></a>
+    <div class="doc doc-contents first">
+            <p class="doc doc-class-bases">
+              Bases: <code><span title="llm_datasets.datasets.base.BaseDocumentDataset">BaseDocumentDataset</span></code></p>
+
+
+              <details class="quote">
+                <summary>Source code in <code>src/llm_datasets/datasets/hf_dataset.py</code></summary>
+                <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-12"> 12</a></span>
+<span class="normal"><a href="#__codelineno-0-13"> 13</a></span>
+<span class="normal"><a href="#__codelineno-0-14"> 14</a></span>
+<span class="normal"><a href="#__codelineno-0-15"> 15</a></span>
+<span class="normal"><a href="#__codelineno-0-16"> 16</a></span>
+<span class="normal"><a href="#__codelineno-0-17"> 17</a></span>
+<span class="normal"><a href="#__codelineno-0-18"> 18</a></span>
+<span class="normal"><a href="#__codelineno-0-19"> 19</a></span>
+<span class="normal"><a href="#__codelineno-0-20"> 20</a></span>
+<span class="normal"><a href="#__codelineno-0-21"> 21</a></span>
+<span class="normal"><a href="#__codelineno-0-22"> 22</a></span>
+<span class="normal"><a href="#__codelineno-0-23"> 23</a></span>
+<span class="normal"><a href="#__codelineno-0-24"> 24</a></span>
+<span class="normal"><a href="#__codelineno-0-25"> 25</a></span>
+<span class="normal"><a href="#__codelineno-0-26"> 26</a></span>
+<span class="normal"><a href="#__codelineno-0-27"> 27</a></span>
+<span class="normal"><a href="#__codelineno-0-28"> 28</a></span>
+<span class="normal"><a href="#__codelineno-0-29"> 29</a></span>
+<span class="normal"><a href="#__codelineno-0-30"> 30</a></span>
+<span class="normal"><a href="#__codelineno-0-31"> 31</a></span>
+<span class="normal"><a href="#__codelineno-0-32"> 32</a></span>
+<span class="normal"><a href="#__codelineno-0-33"> 33</a></span>
+<span class="normal"><a href="#__codelineno-0-34"> 34</a></span>
+<span class="normal"><a href="#__codelineno-0-35"> 35</a></span>
+<span class="normal"><a href="#__codelineno-0-36"> 36</a></span>
+<span class="normal"><a href="#__codelineno-0-37"> 37</a></span>
+<span class="normal"><a href="#__codelineno-0-38"> 38</a></span>
+<span class="normal"><a href="#__codelineno-0-39"> 39</a></span>
+<span class="normal"><a href="#__codelineno-0-40"> 40</a></span>
+<span class="normal"><a href="#__codelineno-0-41"> 41</a></span>
+<span class="normal"><a href="#__codelineno-0-42"> 42</a></span>
+<span class="normal"><a href="#__codelineno-0-43"> 43</a></span>
+<span class="normal"><a href="#__codelineno-0-44"> 44</a></span>
+<span class="normal"><a href="#__codelineno-0-45"> 45</a></span>
+<span class="normal"><a href="#__codelineno-0-46"> 46</a></span>
+<span class="normal"><a href="#__codelineno-0-47"> 47</a></span>
+<span class="normal"><a href="#__codelineno-0-48"> 48</a></span>
+<span class="normal"><a href="#__codelineno-0-49"> 49</a></span>
+<span class="normal"><a href="#__codelineno-0-50"> 50</a></span>
+<span class="normal"><a href="#__codelineno-0-51"> 51</a></span>
+<span class="normal"><a href="#__codelineno-0-52"> 52</a></span>
+<span class="normal"><a href="#__codelineno-0-53"> 53</a></span>
+<span class="normal"><a href="#__codelineno-0-54"> 54</a></span>
+<span class="normal"><a href="#__codelineno-0-55"> 55</a></span>
+<span class="normal"><a href="#__codelineno-0-56"> 56</a></span>
+<span class="normal"><a href="#__codelineno-0-57"> 57</a></span>
+<span class="normal"><a href="#__codelineno-0-58"> 58</a></span>
+<span class="normal"><a href="#__codelineno-0-59"> 59</a></span>
+<span class="normal"><a href="#__codelineno-0-60"> 60</a></span>
+<span class="normal"><a href="#__codelineno-0-61"> 61</a></span>
+<span class="normal"><a href="#__codelineno-0-62"> 62</a></span>
+<span class="normal"><a href="#__codelineno-0-63"> 63</a></span>
+<span class="normal"><a href="#__codelineno-0-64"> 64</a></span>
+<span class="normal"><a href="#__codelineno-0-65"> 65</a></span>
+<span class="normal"><a href="#__codelineno-0-66"> 66</a></span>
+<span class="normal"><a href="#__codelineno-0-67"> 67</a></span>
+<span class="normal"><a href="#__codelineno-0-68"> 68</a></span>
+<span class="normal"><a href="#__codelineno-0-69"> 69</a></span>
+<span class="normal"><a href="#__codelineno-0-70"> 70</a></span>
+<span class="normal"><a href="#__codelineno-0-71"> 71</a></span>
+<span class="normal"><a href="#__codelineno-0-72"> 72</a></span>
+<span class="normal"><a href="#__codelineno-0-73"> 73</a></span>
+<span class="normal"><a href="#__codelineno-0-74"> 74</a></span>
+<span class="normal"><a href="#__codelineno-0-75"> 75</a></span>
+<span class="normal"><a href="#__codelineno-0-76"> 76</a></span>
+<span class="normal"><a href="#__codelineno-0-77"> 77</a></span>
+<span class="normal"><a href="#__codelineno-0-78"> 78</a></span>
+<span class="normal"><a href="#__codelineno-0-79"> 79</a></span>
+<span class="normal"><a href="#__codelineno-0-80"> 80</a></span>
+<span class="normal"><a href="#__codelineno-0-81"> 81</a></span>
+<span class="normal"><a href="#__codelineno-0-82"> 82</a></span>
+<span class="normal"><a href="#__codelineno-0-83"> 83</a></span>
+<span class="normal"><a href="#__codelineno-0-84"> 84</a></span>
+<span class="normal"><a href="#__codelineno-0-85"> 85</a></span>
+<span class="normal"><a href="#__codelineno-0-86"> 86</a></span>
+<span class="normal"><a href="#__codelineno-0-87"> 87</a></span>
+<span class="normal"><a href="#__codelineno-0-88"> 88</a></span>
+<span class="normal"><a href="#__codelineno-0-89"> 89</a></span>
+<span class="normal"><a href="#__codelineno-0-90"> 90</a></span>
+<span class="normal"><a href="#__codelineno-0-91"> 91</a></span>
+<span class="normal"><a href="#__codelineno-0-92"> 92</a></span>
+<span class="normal"><a href="#__codelineno-0-93"> 93</a></span>
+<span class="normal"><a href="#__codelineno-0-94"> 94</a></span>
+<span class="normal"><a href="#__codelineno-0-95"> 95</a></span>
+<span class="normal"><a href="#__codelineno-0-96"> 96</a></span>
+<span class="normal"><a href="#__codelineno-0-97"> 97</a></span>
+<span class="normal"><a href="#__codelineno-0-98"> 98</a></span>
+<span class="normal"><a href="#__codelineno-0-99"> 99</a></span>
+<span class="normal"><a href="#__codelineno-0-100">100</a></span>
+<span class="normal"><a href="#__codelineno-0-101">101</a></span>
+<span class="normal"><a href="#__codelineno-0-102">102</a></span>
+<span class="normal"><a href="#__codelineno-0-103">103</a></span>
+<span class="normal"><a href="#__codelineno-0-104">104</a></span>
+<span class="normal"><a href="#__codelineno-0-105">105</a></span>
+<span class="normal"><a href="#__codelineno-0-106">106</a></span>
+<span class="normal"><a href="#__codelineno-0-107">107</a></span>
+<span class="normal"><a href="#__codelineno-0-108">108</a></span>
+<span class="normal"><a href="#__codelineno-0-109">109</a></span>
+<span class="normal"><a href="#__codelineno-0-110">110</a></span>
+<span class="normal"><a href="#__codelineno-0-111">111</a></span>
+<span class="normal"><a href="#__codelineno-0-112">112</a></span>
+<span class="normal"><a href="#__codelineno-0-113">113</a></span>
+<span class="normal"><a href="#__codelineno-0-114">114</a></span>
+<span class="normal"><a href="#__codelineno-0-115">115</a></span>
+<span class="normal"><a href="#__codelineno-0-116">116</a></span>
+<span class="normal"><a href="#__codelineno-0-117">117</a></span>
+<span class="normal"><a href="#__codelineno-0-118">118</a></span>
+<span class="normal"><a href="#__codelineno-0-119">119</a></span>
+<span class="normal"><a href="#__codelineno-0-120">120</a></span>
+<span class="normal"><a href="#__codelineno-0-121">121</a></span>
+<span class="normal"><a href="#__codelineno-0-122">122</a></span>
+<span class="normal"><a href="#__codelineno-0-123">123</a></span>
+<span class="normal"><a href="#__codelineno-0-124">124</a></span>
+<span class="normal"><a href="#__codelineno-0-125">125</a></span>
+<span class="normal"><a href="#__codelineno-0-126">126</a></span>
+<span class="normal"><a href="#__codelineno-0-127">127</a></span>
+<span class="normal"><a href="#__codelineno-0-128">128</a></span>
+<span class="normal"><a href="#__codelineno-0-129">129</a></span>
+<span class="normal"><a href="#__codelineno-0-130">130</a></span>
+<span class="normal"><a href="#__codelineno-0-131">131</a></span>
+<span class="normal"><a href="#__codelineno-0-132">132</a></span>
+<span class="normal"><a href="#__codelineno-0-133">133</a></span>
+<span class="normal"><a href="#__codelineno-0-134">134</a></span>
+<span class="normal"><a href="#__codelineno-0-135">135</a></span>
+<span class="normal"><a href="#__codelineno-0-136">136</a></span>
+<span class="normal"><a href="#__codelineno-0-137">137</a></span>
+<span class="normal"><a href="#__codelineno-0-138">138</a></span>
+<span class="normal"><a href="#__codelineno-0-139">139</a></span>
+<span class="normal"><a href="#__codelineno-0-140">140</a></span>
+<span class="normal"><a href="#__codelineno-0-141">141</a></span>
+<span class="normal"><a href="#__codelineno-0-142">142</a></span>
+<span class="normal"><a href="#__codelineno-0-143">143</a></span>
+<span class="normal"><a href="#__codelineno-0-144">144</a></span>
+<span class="normal"><a href="#__codelineno-0-145">145</a></span>
+<span class="normal"><a href="#__codelineno-0-146">146</a></span>
+<span class="normal"><a href="#__codelineno-0-147">147</a></span>
+<span class="normal"><a href="#__codelineno-0-148">148</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-12"><a id="__codelineno-0-12" name="__codelineno-0-12"></a><span class="k">class</span> <span class="nc">HFDataset</span><span class="p">(</span><span class="n">BaseDocumentDataset</span><span class="p">):</span>
+</span><span id="__span-0-13"><a id="__codelineno-0-13" name="__codelineno-0-13"></a>    <span class="n">HF_DATASET_ID</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-14"><a id="__codelineno-0-14" name="__codelineno-0-14"></a>    <span class="n">HF_DATASET_SPLIT</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-15"><a id="__codelineno-0-15" name="__codelineno-0-15"></a>    <span class="n">HF_DATASET_CONFIGS</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-16"><a id="__codelineno-0-16" name="__codelineno-0-16"></a>    <span class="n">HF_DATA_DIR</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-17"><a id="__codelineno-0-17" name="__codelineno-0-17"></a>    <span class="n">HF_KWARGS</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-18"><a id="__codelineno-0-18" name="__codelineno-0-18"></a>    <span class="n">HF_REVISION</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-19"><a id="__codelineno-0-19" name="__codelineno-0-19"></a>
+</span><span id="__span-0-20"><a id="__codelineno-0-20" name="__codelineno-0-20"></a>    <span class="n">config_to_dataset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-21"><a id="__codelineno-0-21" name="__codelineno-0-21"></a>    <span class="n">id_column_name</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-22"><a id="__codelineno-0-22" name="__codelineno-0-22"></a>    <span class="n">text_column_name</span> <span class="o">=</span> <span class="s2">&quot;text&quot;</span>
+</span><span id="__span-0-23"><a id="__codelineno-0-23" name="__codelineno-0-23"></a>    <span class="n">title_column_name</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-24"><a id="__codelineno-0-24" name="__codelineno-0-24"></a>    <span class="n">metadata_column_names</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-25"><a id="__codelineno-0-25" name="__codelineno-0-25"></a>    <span class="n">remove_columns</span> <span class="o">=</span> <span class="kc">None</span>
+</span><span id="__span-0-26"><a id="__codelineno-0-26" name="__codelineno-0-26"></a>    <span class="n">streaming</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-27"><a id="__codelineno-0-27" name="__codelineno-0-27"></a>    <span class="n">keep_columns</span> <span class="o">=</span> <span class="kc">False</span>
+</span><span id="__span-0-28"><a id="__codelineno-0-28" name="__codelineno-0-28"></a>
+</span><span id="__span-0-29"><a id="__codelineno-0-29" name="__codelineno-0-29"></a>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-30"><a id="__codelineno-0-30" name="__codelineno-0-30"></a>        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+</span><span id="__span-0-31"><a id="__codelineno-0-31" name="__codelineno-0-31"></a>
+</span><span id="__span-0-32"><a id="__codelineno-0-32" name="__codelineno-0-32"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_ID</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-33"><a id="__codelineno-0-33" name="__codelineno-0-33"></a>            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;HF_DATASET_ID is not set&quot;</span><span class="p">)</span>
+</span><span id="__span-0-34"><a id="__codelineno-0-34" name="__codelineno-0-34"></a>
+</span><span id="__span-0-35"><a id="__codelineno-0-35" name="__codelineno-0-35"></a>    <span class="k">def</span> <span class="nf">get_hf_configs</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-36"><a id="__codelineno-0-36" name="__codelineno-0-36"></a>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_CONFIGS</span><span class="p">:</span>
+</span><span id="__span-0-37"><a id="__codelineno-0-37" name="__codelineno-0-37"></a>            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_CONFIGS</span>
+</span><span id="__span-0-38"><a id="__codelineno-0-38" name="__codelineno-0-38"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-39"><a id="__codelineno-0-39" name="__codelineno-0-39"></a>            <span class="c1"># if no config is used</span>
+</span><span id="__span-0-40"><a id="__codelineno-0-40" name="__codelineno-0-40"></a>            <span class="k">return</span> <span class="p">[</span><span class="kc">None</span><span class="p">]</span>
+</span><span id="__span-0-41"><a id="__codelineno-0-41" name="__codelineno-0-41"></a>
+</span><span id="__span-0-42"><a id="__codelineno-0-42" name="__codelineno-0-42"></a>    <span class="k">def</span> <span class="nf">download</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-43"><a id="__codelineno-0-43" name="__codelineno-0-43"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span> <span class="o">=</span> <span class="p">{}</span>
+</span><span id="__span-0-44"><a id="__codelineno-0-44" name="__codelineno-0-44"></a>
+</span><span id="__span-0-45"><a id="__codelineno-0-45" name="__codelineno-0-45"></a>        <span class="k">for</span> <span class="n">hf_config</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_hf_configs</span><span class="p">():</span>
+</span><span id="__span-0-46"><a id="__codelineno-0-46" name="__codelineno-0-46"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Downloading for </span><span class="si">{</span><span class="n">hf_config</span><span class="si">=}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-47"><a id="__codelineno-0-47" name="__codelineno-0-47"></a>
+</span><span id="__span-0-48"><a id="__codelineno-0-48" name="__codelineno-0-48"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">HF_KWARGS</span><span class="p">:</span>
+</span><span id="__span-0-49"><a id="__codelineno-0-49" name="__codelineno-0-49"></a>                <span class="c1"># use additional kwargs as defined by dataset class</span>
+</span><span id="__span-0-50"><a id="__codelineno-0-50" name="__codelineno-0-50"></a>                <span class="n">ds</span> <span class="o">=</span> <span class="n">load_dataset</span><span class="p">(</span>
+</span><span id="__span-0-51"><a id="__codelineno-0-51" name="__codelineno-0-51"></a>                    <span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_ID</span><span class="p">,</span>
+</span><span id="__span-0-52"><a id="__codelineno-0-52" name="__codelineno-0-52"></a>                    <span class="n">hf_config</span><span class="p">,</span>
+</span><span id="__span-0-53"><a id="__codelineno-0-53" name="__codelineno-0-53"></a>                    <span class="n">split</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_SPLIT</span><span class="p">,</span>
+</span><span id="__span-0-54"><a id="__codelineno-0-54" name="__codelineno-0-54"></a>                    <span class="n">data_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_DATA_DIR</span><span class="p">,</span>
+</span><span id="__span-0-55"><a id="__codelineno-0-55" name="__codelineno-0-55"></a>                    <span class="n">streaming</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">streaming</span><span class="p">,</span>
+</span><span id="__span-0-56"><a id="__codelineno-0-56" name="__codelineno-0-56"></a>                    <span class="n">use_auth_token</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">get_hf_auth_token</span><span class="p">(),</span>
+</span><span id="__span-0-57"><a id="__codelineno-0-57" name="__codelineno-0-57"></a>                    <span class="n">keep_in_memory</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-58"><a id="__codelineno-0-58" name="__codelineno-0-58"></a>                    <span class="n">revision</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_REVISION</span><span class="p">,</span>
+</span><span id="__span-0-59"><a id="__codelineno-0-59" name="__codelineno-0-59"></a>                    <span class="o">**</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_KWARGS</span><span class="p">,</span>
+</span><span id="__span-0-60"><a id="__codelineno-0-60" name="__codelineno-0-60"></a>                <span class="p">)</span>
+</span><span id="__span-0-61"><a id="__codelineno-0-61" name="__codelineno-0-61"></a>            <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-62"><a id="__codelineno-0-62" name="__codelineno-0-62"></a>                <span class="n">ds</span> <span class="o">=</span> <span class="n">load_dataset</span><span class="p">(</span>
+</span><span id="__span-0-63"><a id="__codelineno-0-63" name="__codelineno-0-63"></a>                    <span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_ID</span><span class="p">,</span>
+</span><span id="__span-0-64"><a id="__codelineno-0-64" name="__codelineno-0-64"></a>                    <span class="n">hf_config</span><span class="p">,</span>
+</span><span id="__span-0-65"><a id="__codelineno-0-65" name="__codelineno-0-65"></a>                    <span class="n">split</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_SPLIT</span><span class="p">,</span>
+</span><span id="__span-0-66"><a id="__codelineno-0-66" name="__codelineno-0-66"></a>                    <span class="n">data_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_DATA_DIR</span><span class="p">,</span>
+</span><span id="__span-0-67"><a id="__codelineno-0-67" name="__codelineno-0-67"></a>                    <span class="n">streaming</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">streaming</span><span class="p">,</span>
+</span><span id="__span-0-68"><a id="__codelineno-0-68" name="__codelineno-0-68"></a>                    <span class="n">use_auth_token</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">get_hf_auth_token</span><span class="p">(),</span>
+</span><span id="__span-0-69"><a id="__codelineno-0-69" name="__codelineno-0-69"></a>                    <span class="n">keep_in_memory</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+</span><span id="__span-0-70"><a id="__codelineno-0-70" name="__codelineno-0-70"></a>                    <span class="n">revision</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">HF_REVISION</span><span class="p">,</span>
+</span><span id="__span-0-71"><a id="__codelineno-0-71" name="__codelineno-0-71"></a>                <span class="p">)</span>
+</span><span id="__span-0-72"><a id="__codelineno-0-72" name="__codelineno-0-72"></a>
+</span><span id="__span-0-73"><a id="__codelineno-0-73" name="__codelineno-0-73"></a>            <span class="c1"># check dataset split</span>
+</span><span id="__span-0-74"><a id="__codelineno-0-74" name="__codelineno-0-74"></a>            <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ds</span><span class="p">,</span> <span class="n">DatasetDict</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">HF_DATASET_SPLIT</span><span class="p">:</span>
+</span><span id="__span-0-75"><a id="__codelineno-0-75" name="__codelineno-0-75"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;HF returned DatasetDict but split not set: </span><span class="si">{</span><span class="n">DatasetDict</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-76"><a id="__codelineno-0-76" name="__codelineno-0-76"></a>
+</span><span id="__span-0-77"><a id="__codelineno-0-77" name="__codelineno-0-77"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">limit</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-78"><a id="__codelineno-0-78" name="__codelineno-0-78"></a>                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">streaming</span><span class="p">:</span>
+</span><span id="__span-0-79"><a id="__codelineno-0-79" name="__codelineno-0-79"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Limit requested (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="si">=}</span><span class="s2">) but streaming is enabled!&quot;</span><span class="p">)</span>
+</span><span id="__span-0-80"><a id="__codelineno-0-80" name="__codelineno-0-80"></a>                <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-81"><a id="__codelineno-0-81" name="__codelineno-0-81"></a>                    <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Limiting dataset to: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-82"><a id="__codelineno-0-82" name="__codelineno-0-82"></a>                    <span class="n">ds</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">limit</span><span class="p">))</span>
+</span><span id="__span-0-83"><a id="__codelineno-0-83" name="__codelineno-0-83"></a>
+</span><span id="__span-0-84"><a id="__codelineno-0-84" name="__codelineno-0-84"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">remove_columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+</span><span id="__span-0-85"><a id="__codelineno-0-85" name="__codelineno-0-85"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Removing columns (at download): </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">remove_columns</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-86"><a id="__codelineno-0-86" name="__codelineno-0-86"></a>
+</span><span id="__span-0-87"><a id="__codelineno-0-87" name="__codelineno-0-87"></a>                <span class="n">ds</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">remove_columns</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">remove_columns</span><span class="p">)</span>
+</span><span id="__span-0-88"><a id="__codelineno-0-88" name="__codelineno-0-88"></a>
+</span><span id="__span-0-89"><a id="__codelineno-0-89" name="__codelineno-0-89"></a>            <span class="n">filter_func</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_filter_func</span><span class="p">()</span>
+</span><span id="__span-0-90"><a id="__codelineno-0-90" name="__codelineno-0-90"></a>            <span class="k">if</span> <span class="n">filter_func</span><span class="p">:</span>
+</span><span id="__span-0-91"><a id="__codelineno-0-91" name="__codelineno-0-91"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Dataset size before filter: </span><span class="si">{</span><span class="nb">len</span><span class="p">(</span><span class="n">ds</span><span class="p">)</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-92"><a id="__codelineno-0-92" name="__codelineno-0-92"></a>
+</span><span id="__span-0-93"><a id="__codelineno-0-93" name="__codelineno-0-93"></a>                <span class="n">ds</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">filter_func</span><span class="p">,</span> <span class="n">num_proc</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">workers</span><span class="p">)</span>
+</span><span id="__span-0-94"><a id="__codelineno-0-94" name="__codelineno-0-94"></a>
+</span><span id="__span-0-95"><a id="__codelineno-0-95" name="__codelineno-0-95"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Dataset size after filter: </span><span class="si">{</span><span class="nb">len</span><span class="p">(</span><span class="n">ds</span><span class="p">)</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-96"><a id="__codelineno-0-96" name="__codelineno-0-96"></a>
+</span><span id="__span-0-97"><a id="__codelineno-0-97" name="__codelineno-0-97"></a>            <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">hf_config</span><span class="p">]</span> <span class="o">=</span> <span class="n">ds</span>
+</span><span id="__span-0-98"><a id="__codelineno-0-98" name="__codelineno-0-98"></a>
+</span><span id="__span-0-99"><a id="__codelineno-0-99" name="__codelineno-0-99"></a>    <span class="k">def</span> <span class="nf">get_filter_func</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-100"><a id="__codelineno-0-100" name="__codelineno-0-100"></a>        <span class="k">return</span> <span class="kc">None</span>
+</span><span id="__span-0-101"><a id="__codelineno-0-101" name="__codelineno-0-101"></a>
+</span><span id="__span-0-102"><a id="__codelineno-0-102" name="__codelineno-0-102"></a>    <span class="k">def</span> <span class="nf">get_document_from_item</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Document</span><span class="p">:</span>
+</span><span id="__span-0-103"><a id="__codelineno-0-103" name="__codelineno-0-103"></a>        <span class="k">return</span> <span class="n">Document</span><span class="p">(</span>
+</span><span id="__span-0-104"><a id="__codelineno-0-104" name="__codelineno-0-104"></a>            <span class="n">text</span><span class="o">=</span><span class="n">item</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">text_column_name</span><span class="p">],</span>
+</span><span id="__span-0-105"><a id="__codelineno-0-105" name="__codelineno-0-105"></a>            <span class="nb">id</span><span class="o">=</span><span class="n">item</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">id_column_name</span><span class="p">]</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">id_column_name</span> <span class="k">else</span> <span class="n">index</span><span class="p">,</span>
+</span><span id="__span-0-106"><a id="__codelineno-0-106" name="__codelineno-0-106"></a>            <span class="n">metadata</span><span class="o">=</span><span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">item</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata_column_names</span><span class="p">}</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata_column_names</span> <span class="k">else</span> <span class="p">{},</span>
+</span><span id="__span-0-107"><a id="__codelineno-0-107" name="__codelineno-0-107"></a>        <span class="p">)</span>
+</span><span id="__span-0-108"><a id="__codelineno-0-108" name="__codelineno-0-108"></a>
+</span><span id="__span-0-109"><a id="__codelineno-0-109" name="__codelineno-0-109"></a>    <span class="k">def</span> <span class="nf">prepend_title</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">example</span><span class="p">):</span>
+</span><span id="__span-0-110"><a id="__codelineno-0-110" name="__codelineno-0-110"></a>        <span class="n">example</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">text_column_name</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span>
+</span><span id="__span-0-111"><a id="__codelineno-0-111" name="__codelineno-0-111"></a>            <span class="n">example</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">title_column_name</span><span class="p">]</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">title_delimiter</span> <span class="o">+</span> <span class="n">example</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">text_column_name</span><span class="p">]</span>
+</span><span id="__span-0-112"><a id="__codelineno-0-112" name="__codelineno-0-112"></a>        <span class="p">)</span>
+</span><span id="__span-0-113"><a id="__codelineno-0-113" name="__codelineno-0-113"></a>
+</span><span id="__span-0-114"><a id="__codelineno-0-114" name="__codelineno-0-114"></a>        <span class="k">return</span> <span class="n">example</span>
+</span><span id="__span-0-115"><a id="__codelineno-0-115" name="__codelineno-0-115"></a>
+</span><span id="__span-0-116"><a id="__codelineno-0-116" name="__codelineno-0-116"></a>    <span class="k">def</span> <span class="nf">get_documents</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Document</span><span class="p">]:</span>
+</span><span id="__span-0-117"><a id="__codelineno-0-117" name="__codelineno-0-117"></a>        <span class="bp">self</span><span class="o">.</span><span class="n">download</span><span class="p">()</span>
+</span><span id="__span-0-118"><a id="__codelineno-0-118" name="__codelineno-0-118"></a>        <span class="n">doc_idx</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-119"><a id="__codelineno-0-119" name="__codelineno-0-119"></a>        <span class="c1"># drop all non-text columns</span>
+</span><span id="__span-0-120"><a id="__codelineno-0-120" name="__codelineno-0-120"></a>        <span class="k">for</span> <span class="n">ds_idx</span><span class="p">,</span> <span class="n">config</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">):</span>
+</span><span id="__span-0-121"><a id="__codelineno-0-121" name="__codelineno-0-121"></a>            <span class="c1"># remove non-text and non-title columns</span>
+</span><span id="__span-0-122"><a id="__codelineno-0-122" name="__codelineno-0-122"></a>            <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">keep_columns</span><span class="p">:</span>
+</span><span id="__span-0-123"><a id="__codelineno-0-123" name="__codelineno-0-123"></a>                <span class="n">columns_to_remove</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span> <span class="o">-</span> <span class="p">{</span><span class="bp">self</span><span class="o">.</span><span class="n">text_column_name</span><span class="p">}</span>
+</span><span id="__span-0-124"><a id="__codelineno-0-124" name="__codelineno-0-124"></a>
+</span><span id="__span-0-125"><a id="__codelineno-0-125" name="__codelineno-0-125"></a>                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">title_column_name</span><span class="p">:</span>
+</span><span id="__span-0-126"><a id="__codelineno-0-126" name="__codelineno-0-126"></a>                    <span class="n">columns_to_remove</span> <span class="o">=</span> <span class="n">columns_to_remove</span> <span class="o">-</span> <span class="p">{</span><span class="bp">self</span><span class="o">.</span><span class="n">title_column_name</span><span class="p">}</span>
+</span><span id="__span-0-127"><a id="__codelineno-0-127" name="__codelineno-0-127"></a>
+</span><span id="__span-0-128"><a id="__codelineno-0-128" name="__codelineno-0-128"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Removing columns (get texts): </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">columns_to_remove</span><span class="p">)</span>
+</span><span id="__span-0-129"><a id="__codelineno-0-129" name="__codelineno-0-129"></a>
+</span><span id="__span-0-130"><a id="__codelineno-0-130" name="__codelineno-0-130"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span><span class="o">.</span><span class="n">remove_columns</span><span class="p">(</span><span class="n">columns_to_remove</span><span class="p">)</span>
+</span><span id="__span-0-131"><a id="__codelineno-0-131" name="__codelineno-0-131"></a>
+</span><span id="__span-0-132"><a id="__codelineno-0-132" name="__codelineno-0-132"></a>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">title_column_name</span><span class="p">:</span>
+</span><span id="__span-0-133"><a id="__codelineno-0-133" name="__codelineno-0-133"></a>                <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Prepending title to text column (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">title_column_name</span><span class="si">=}</span><span class="s2">)&quot;</span><span class="p">)</span>
+</span><span id="__span-0-134"><a id="__codelineno-0-134" name="__codelineno-0-134"></a>
+</span><span id="__span-0-135"><a id="__codelineno-0-135" name="__codelineno-0-135"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">prepend_title</span><span class="p">)</span>
+</span><span id="__span-0-136"><a id="__codelineno-0-136" name="__codelineno-0-136"></a>
+</span><span id="__span-0-137"><a id="__codelineno-0-137" name="__codelineno-0-137"></a>                <span class="c1"># remove title column</span>
+</span><span id="__span-0-138"><a id="__codelineno-0-138" name="__codelineno-0-138"></a>                <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">]</span><span class="o">.</span><span class="n">remove_columns</span><span class="p">([</span><span class="bp">self</span><span class="o">.</span><span class="n">title_column_name</span><span class="p">])</span>
+</span><span id="__span-0-139"><a id="__codelineno-0-139" name="__codelineno-0-139"></a>
+</span><span id="__span-0-140"><a id="__codelineno-0-140" name="__codelineno-0-140"></a>            <span class="n">ds_iterator</span> <span class="o">=</span> <span class="nb">iter</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">config_to_dataset</span><span class="p">[</span><span class="n">config</span><span class="p">])</span>
+</span><span id="__span-0-141"><a id="__codelineno-0-141" name="__codelineno-0-141"></a>
+</span><span id="__span-0-142"><a id="__codelineno-0-142" name="__codelineno-0-142"></a>            <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">ds_iterator</span><span class="p">:</span>
+</span><span id="__span-0-143"><a id="__codelineno-0-143" name="__codelineno-0-143"></a>                <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">&quot;get_documents_from_item&quot;</span><span class="p">):</span>
+</span><span id="__span-0-144"><a id="__codelineno-0-144" name="__codelineno-0-144"></a>                    <span class="c1"># multiple documents from a single item</span>
+</span><span id="__span-0-145"><a id="__codelineno-0-145" name="__codelineno-0-145"></a>                    <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_documents_from_item</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
+</span><span id="__span-0-146"><a id="__codelineno-0-146" name="__codelineno-0-146"></a>                <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-147"><a id="__codelineno-0-147" name="__codelineno-0-147"></a>                    <span class="k">yield</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_document_from_item</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="n">doc_idx</span><span class="p">)</span>
+</span><span id="__span-0-148"><a id="__codelineno-0-148" name="__codelineno-0-148"></a>                    <span class="n">doc_idx</span> <span class="o">+=</span> <span class="mi">1</span>
+</span></code></pre></div></td></tr></table></div>
+              </details>
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/api/jsonl_dataset/index.html b/api/jsonl_dataset/index.html
new file mode 100644
index 0000000..2d5ec29
--- /dev/null
+++ b/api/jsonl_dataset/index.html
@@ -0,0 +1,1082 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/api/jsonl_dataset/">
+      
+      
+        <link rel="prev" href="../hf_dataset/">
+      
+      
+        <link rel="next" href="../config/">
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>JSONLDataset - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#basedataset" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              JSONLDataset
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" checked>
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="true">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      JSONLDataset
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_document_from_item" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_document_from_item
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_text_from_item" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_text_from_item
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_texts
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_multi_proc" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_texts_with_multi_proc
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_single_proc" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_texts_with_single_proc
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      JSONLDataset
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_document_from_item" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_document_from_item
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_text_from_item" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_text_from_item
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_texts
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_multi_proc" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_texts_with_multi_proc
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_single_proc" class="md-nav__link">
+    <span class="md-ellipsis">
+      get_texts_with_single_proc
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="basedataset">JSONLDataset</h1>
+
+
+<div class="doc doc-object doc-class">
+
+
+
+<a id="llm_datasets.datasets.jsonl_dataset.JSONLDataset"></a>
+    <div class="doc doc-contents first">
+            <p class="doc doc-class-bases">
+              Bases: <code><span title="llm_datasets.datasets.jsonl_dataset.JSONLMixin">JSONLMixin</span></code>, <code><span title="llm_datasets.datasets.base.BaseTextDataset">BaseTextDataset</span></code></p>
+
+
+              <details class="quote">
+                <summary>Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code></summary>
+                <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-54"> 54</a></span>
+<span class="normal"><a href="#__codelineno-0-55"> 55</a></span>
+<span class="normal"><a href="#__codelineno-0-56"> 56</a></span>
+<span class="normal"><a href="#__codelineno-0-57"> 57</a></span>
+<span class="normal"><a href="#__codelineno-0-58"> 58</a></span>
+<span class="normal"><a href="#__codelineno-0-59"> 59</a></span>
+<span class="normal"><a href="#__codelineno-0-60"> 60</a></span>
+<span class="normal"><a href="#__codelineno-0-61"> 61</a></span>
+<span class="normal"><a href="#__codelineno-0-62"> 62</a></span>
+<span class="normal"><a href="#__codelineno-0-63"> 63</a></span>
+<span class="normal"><a href="#__codelineno-0-64"> 64</a></span>
+<span class="normal"><a href="#__codelineno-0-65"> 65</a></span>
+<span class="normal"><a href="#__codelineno-0-66"> 66</a></span>
+<span class="normal"><a href="#__codelineno-0-67"> 67</a></span>
+<span class="normal"><a href="#__codelineno-0-68"> 68</a></span>
+<span class="normal"><a href="#__codelineno-0-69"> 69</a></span>
+<span class="normal"><a href="#__codelineno-0-70"> 70</a></span>
+<span class="normal"><a href="#__codelineno-0-71"> 71</a></span>
+<span class="normal"><a href="#__codelineno-0-72"> 72</a></span>
+<span class="normal"><a href="#__codelineno-0-73"> 73</a></span>
+<span class="normal"><a href="#__codelineno-0-74"> 74</a></span>
+<span class="normal"><a href="#__codelineno-0-75"> 75</a></span>
+<span class="normal"><a href="#__codelineno-0-76"> 76</a></span>
+<span class="normal"><a href="#__codelineno-0-77"> 77</a></span>
+<span class="normal"><a href="#__codelineno-0-78"> 78</a></span>
+<span class="normal"><a href="#__codelineno-0-79"> 79</a></span>
+<span class="normal"><a href="#__codelineno-0-80"> 80</a></span>
+<span class="normal"><a href="#__codelineno-0-81"> 81</a></span>
+<span class="normal"><a href="#__codelineno-0-82"> 82</a></span>
+<span class="normal"><a href="#__codelineno-0-83"> 83</a></span>
+<span class="normal"><a href="#__codelineno-0-84"> 84</a></span>
+<span class="normal"><a href="#__codelineno-0-85"> 85</a></span>
+<span class="normal"><a href="#__codelineno-0-86"> 86</a></span>
+<span class="normal"><a href="#__codelineno-0-87"> 87</a></span>
+<span class="normal"><a href="#__codelineno-0-88"> 88</a></span>
+<span class="normal"><a href="#__codelineno-0-89"> 89</a></span>
+<span class="normal"><a href="#__codelineno-0-90"> 90</a></span>
+<span class="normal"><a href="#__codelineno-0-91"> 91</a></span>
+<span class="normal"><a href="#__codelineno-0-92"> 92</a></span>
+<span class="normal"><a href="#__codelineno-0-93"> 93</a></span>
+<span class="normal"><a href="#__codelineno-0-94"> 94</a></span>
+<span class="normal"><a href="#__codelineno-0-95"> 95</a></span>
+<span class="normal"><a href="#__codelineno-0-96"> 96</a></span>
+<span class="normal"><a href="#__codelineno-0-97"> 97</a></span>
+<span class="normal"><a href="#__codelineno-0-98"> 98</a></span>
+<span class="normal"><a href="#__codelineno-0-99"> 99</a></span>
+<span class="normal"><a href="#__codelineno-0-100">100</a></span>
+<span class="normal"><a href="#__codelineno-0-101">101</a></span>
+<span class="normal"><a href="#__codelineno-0-102">102</a></span>
+<span class="normal"><a href="#__codelineno-0-103">103</a></span>
+<span class="normal"><a href="#__codelineno-0-104">104</a></span>
+<span class="normal"><a href="#__codelineno-0-105">105</a></span>
+<span class="normal"><a href="#__codelineno-0-106">106</a></span>
+<span class="normal"><a href="#__codelineno-0-107">107</a></span>
+<span class="normal"><a href="#__codelineno-0-108">108</a></span>
+<span class="normal"><a href="#__codelineno-0-109">109</a></span>
+<span class="normal"><a href="#__codelineno-0-110">110</a></span>
+<span class="normal"><a href="#__codelineno-0-111">111</a></span>
+<span class="normal"><a href="#__codelineno-0-112">112</a></span>
+<span class="normal"><a href="#__codelineno-0-113">113</a></span>
+<span class="normal"><a href="#__codelineno-0-114">114</a></span>
+<span class="normal"><a href="#__codelineno-0-115">115</a></span>
+<span class="normal"><a href="#__codelineno-0-116">116</a></span>
+<span class="normal"><a href="#__codelineno-0-117">117</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-54"><a id="__codelineno-0-54" name="__codelineno-0-54"></a><span class="k">class</span> <span class="nc">JSONLDataset</span><span class="p">(</span><span class="n">JSONLMixin</span><span class="p">,</span> <span class="n">BaseTextDataset</span><span class="p">):</span>  <span class="c1"># TODO rename to JSONLTextDataset</span>
+</span><span id="__span-0-55"><a id="__codelineno-0-55" name="__codelineno-0-55"></a>    <span class="k">def</span> <span class="nf">get_text_from_item</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-56"><a id="__codelineno-0-56" name="__codelineno-0-56"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)&quot;&quot;&quot;</span>
+</span><span id="__span-0-57"><a id="__codelineno-0-57" name="__codelineno-0-57"></a>        <span class="k">return</span> <span class="n">item</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">raw_jsonl_text_field</span><span class="p">]</span>
+</span><span id="__span-0-58"><a id="__codelineno-0-58" name="__codelineno-0-58"></a>
+</span><span id="__span-0-59"><a id="__codelineno-0-59" name="__codelineno-0-59"></a>    <span class="k">def</span> <span class="nf">get_document_from_item</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Document</span><span class="p">:</span>
+</span><span id="__span-0-60"><a id="__codelineno-0-60" name="__codelineno-0-60"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)&quot;&quot;&quot;</span>
+</span><span id="__span-0-61"><a id="__codelineno-0-61" name="__codelineno-0-61"></a>        <span class="k">return</span> <span class="n">Document</span><span class="p">(</span><span class="n">text</span><span class="o">=</span><span class="n">item</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">raw_jsonl_text_field</span><span class="p">])</span>
+</span><span id="__span-0-62"><a id="__codelineno-0-62" name="__codelineno-0-62"></a>
+</span><span id="__span-0-63"><a id="__codelineno-0-63" name="__codelineno-0-63"></a>    <span class="k">def</span> <span class="nf">get_texts_from_file_handler</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handler</span><span class="p">):</span>
+</span><span id="__span-0-64"><a id="__codelineno-0-64" name="__codelineno-0-64"></a>        <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">,</span> <span class="s2">&quot;use_documents&quot;</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">use_documents</span><span class="p">:</span>
+</span><span id="__span-0-65"><a id="__codelineno-0-65" name="__codelineno-0-65"></a>            <span class="n">getter_func</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_document_from_item</span>
+</span><span id="__span-0-66"><a id="__codelineno-0-66" name="__codelineno-0-66"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-67"><a id="__codelineno-0-67" name="__codelineno-0-67"></a>            <span class="n">getter_func</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_text_from_item</span>
+</span><span id="__span-0-68"><a id="__codelineno-0-68" name="__codelineno-0-68"></a>
+</span><span id="__span-0-69"><a id="__codelineno-0-69" name="__codelineno-0-69"></a>        <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">file_handler</span><span class="p">:</span>
+</span><span id="__span-0-70"><a id="__codelineno-0-70" name="__codelineno-0-70"></a>            <span class="n">item</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
+</span><span id="__span-0-71"><a id="__codelineno-0-71" name="__codelineno-0-71"></a>            <span class="n">text</span> <span class="o">=</span> <span class="n">getter_func</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
+</span><span id="__span-0-72"><a id="__codelineno-0-72" name="__codelineno-0-72"></a>
+</span><span id="__span-0-73"><a id="__codelineno-0-73" name="__codelineno-0-73"></a>            <span class="k">if</span> <span class="n">text</span><span class="p">:</span>
+</span><span id="__span-0-74"><a id="__codelineno-0-74" name="__codelineno-0-74"></a>                <span class="k">yield</span> <span class="n">text</span>
+</span><span id="__span-0-75"><a id="__codelineno-0-75" name="__codelineno-0-75"></a>
+</span><span id="__span-0-76"><a id="__codelineno-0-76" name="__codelineno-0-76"></a>    <span class="k">def</span> <span class="nf">get_texts_from_file_path</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_path</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="n">Path</span><span class="p">):</span>
+</span><span id="__span-0-77"><a id="__codelineno-0-77" name="__codelineno-0-77"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Reading from </span><span class="si">{</span><span class="n">file_path</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</span><span id="__span-0-78"><a id="__codelineno-0-78" name="__codelineno-0-78"></a>
+</span><span id="__span-0-79"><a id="__codelineno-0-79" name="__codelineno-0-79"></a>        <span class="k">if</span> <span class="p">(</span>
+</span><span id="__span-0-80"><a id="__codelineno-0-80" name="__codelineno-0-80"></a>            <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_path</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="n">file_path</span><span class="o">.</span><span class="n">endswith</span><span class="p">(</span><span class="s2">&quot;.zst&quot;</span><span class="p">)</span>
+</span><span id="__span-0-81"><a id="__codelineno-0-81" name="__codelineno-0-81"></a>        <span class="p">)</span> <span class="ow">or</span> <span class="n">file_path</span><span class="o">.</span><span class="n">suffix</span> <span class="o">==</span> <span class="s2">&quot;.zst&quot;</span><span class="p">:</span>  <span class="c1"># zstd compression</span>
+</span><span id="__span-0-82"><a id="__codelineno-0-82" name="__codelineno-0-82"></a>            <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_path</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">zf</span><span class="p">:</span>
+</span><span id="__span-0-83"><a id="__codelineno-0-83" name="__codelineno-0-83"></a>                <span class="n">dctx</span> <span class="o">=</span> <span class="n">zstd</span><span class="o">.</span><span class="n">ZstdDecompressor</span><span class="p">()</span>  <span class="c1"># uncompress zstd</span>
+</span><span id="__span-0-84"><a id="__codelineno-0-84" name="__codelineno-0-84"></a>                <span class="k">with</span> <span class="n">dctx</span><span class="o">.</span><span class="n">stream_reader</span><span class="p">(</span><span class="n">zf</span><span class="p">)</span> <span class="k">as</span> <span class="n">reader</span><span class="p">:</span>
+</span><span id="__span-0-85"><a id="__codelineno-0-85" name="__codelineno-0-85"></a>                    <span class="n">f</span> <span class="o">=</span> <span class="n">io</span><span class="o">.</span><span class="n">BufferedReader</span><span class="p">(</span><span class="n">reader</span><span class="p">)</span>
+</span><span id="__span-0-86"><a id="__codelineno-0-86" name="__codelineno-0-86"></a>                    <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_texts_from_file_handler</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+</span><span id="__span-0-87"><a id="__codelineno-0-87" name="__codelineno-0-87"></a>        <span class="k">else</span><span class="p">:</span>
+</span><span id="__span-0-88"><a id="__codelineno-0-88" name="__codelineno-0-88"></a>            <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>  <span class="c1"># jsonl or jsonl.fz (via smart_open)</span>
+</span><span id="__span-0-89"><a id="__codelineno-0-89" name="__codelineno-0-89"></a>                <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_texts_from_file_handler</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+</span><span id="__span-0-90"><a id="__codelineno-0-90" name="__codelineno-0-90"></a>
+</span><span id="__span-0-91"><a id="__codelineno-0-91" name="__codelineno-0-91"></a>    <span class="k">def</span> <span class="nf">get_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-92"><a id="__codelineno-0-92" name="__codelineno-0-92"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Iterate over all input files and read JSON from each line.&quot;&quot;&quot;</span>
+</span><span id="__span-0-93"><a id="__codelineno-0-93" name="__codelineno-0-93"></a>        <span class="c1"># if self.workers == 1:</span>
+</span><span id="__span-0-94"><a id="__codelineno-0-94" name="__codelineno-0-94"></a>        <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_texts_with_single_proc</span><span class="p">()</span>
+</span><span id="__span-0-95"><a id="__codelineno-0-95" name="__codelineno-0-95"></a>        <span class="c1"># else:</span>
+</span><span id="__span-0-96"><a id="__codelineno-0-96" name="__codelineno-0-96"></a>        <span class="c1">#     yield from self.get_texts_with_multi_proc()</span>
+</span><span id="__span-0-97"><a id="__codelineno-0-97" name="__codelineno-0-97"></a>
+</span><span id="__span-0-98"><a id="__codelineno-0-98" name="__codelineno-0-98"></a>    <span class="k">def</span> <span class="nf">get_texts_with_multi_proc</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-99"><a id="__codelineno-0-99" name="__codelineno-0-99"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Iterate over all input files in parallel and read JSON from each line.&quot;&quot;&quot;</span>
+</span><span id="__span-0-100"><a id="__codelineno-0-100" name="__codelineno-0-100"></a>        <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">()</span>
+</span><span id="__span-0-101"><a id="__codelineno-0-101" name="__codelineno-0-101"></a>        <span class="c1"># # with multiprocessing.Pool(self.workers) as pool:</span>
+</span><span id="__span-0-102"><a id="__codelineno-0-102" name="__codelineno-0-102"></a>        <span class="c1"># with multiprocess.Pool(self.workers) as pool:</span>
+</span><span id="__span-0-103"><a id="__codelineno-0-103" name="__codelineno-0-103"></a>        <span class="c1">#     for text in flatmap(pool, self.get_texts_from_file_path, self.get_raw_jsonl_paths()):</span>
+</span><span id="__span-0-104"><a id="__codelineno-0-104" name="__codelineno-0-104"></a>        <span class="c1">#         yield text</span>
+</span><span id="__span-0-105"><a id="__codelineno-0-105" name="__codelineno-0-105"></a>
+</span><span id="__span-0-106"><a id="__codelineno-0-106" name="__codelineno-0-106"></a>        <span class="c1"># print(&quot;all files done&quot;)</span>
+</span><span id="__span-0-107"><a id="__codelineno-0-107" name="__codelineno-0-107"></a>
+</span><span id="__span-0-108"><a id="__codelineno-0-108" name="__codelineno-0-108"></a>    <span class="k">def</span> <span class="nf">get_texts_with_single_proc</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-109"><a id="__codelineno-0-109" name="__codelineno-0-109"></a><span class="w">        </span><span class="sd">&quot;&quot;&quot;Iterate over all input files and read JSON from each line.&quot;&quot;&quot;</span>
+</span><span id="__span-0-110"><a id="__codelineno-0-110" name="__codelineno-0-110"></a>        <span class="n">processed_files</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-111"><a id="__codelineno-0-111" name="__codelineno-0-111"></a>        <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_raw_jsonl_paths</span><span class="p">():</span>
+</span><span id="__span-0-112"><a id="__codelineno-0-112" name="__codelineno-0-112"></a>            <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_texts_from_file_path</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-113"><a id="__codelineno-0-113" name="__codelineno-0-113"></a>
+</span><span id="__span-0-114"><a id="__codelineno-0-114" name="__codelineno-0-114"></a>            <span class="n">processed_files</span> <span class="o">+=</span> <span class="mi">1</span>
+</span><span id="__span-0-115"><a id="__codelineno-0-115" name="__codelineno-0-115"></a>
+</span><span id="__span-0-116"><a id="__codelineno-0-116" name="__codelineno-0-116"></a>        <span class="k">if</span> <span class="n">processed_files</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-117"><a id="__codelineno-0-117" name="__codelineno-0-117"></a>            <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;No file has been processed.&quot;</span><span class="p">)</span>
+</span></code></pre></div></td></tr></table></div>
+              </details>
+
+
+
+  <div class="doc doc-children">
+
+
+
+
+
+
+
+
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_document_from_item" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_document_from_item</span><span class="p">(</span><span class="n">item</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-59">59</a></span>
+<span class="normal"><a href="#__codelineno-0-60">60</a></span>
+<span class="normal"><a href="#__codelineno-0-61">61</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-59"><a id="__codelineno-0-59" name="__codelineno-0-59"></a><span class="k">def</span> <span class="nf">get_document_from_item</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Document</span><span class="p">:</span>
+</span><span id="__span-0-60"><a id="__codelineno-0-60" name="__codelineno-0-60"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)&quot;&quot;&quot;</span>
+</span><span id="__span-0-61"><a id="__codelineno-0-61" name="__codelineno-0-61"></a>    <span class="k">return</span> <span class="n">Document</span><span class="p">(</span><span class="n">text</span><span class="o">=</span><span class="n">item</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">raw_jsonl_text_field</span><span class="p">])</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_text_from_item" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_text_from_item</span><span class="p">(</span><span class="n">item</span><span class="p">)</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-55">55</a></span>
+<span class="normal"><a href="#__codelineno-0-56">56</a></span>
+<span class="normal"><a href="#__codelineno-0-57">57</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-55"><a id="__codelineno-0-55" name="__codelineno-0-55"></a><span class="k">def</span> <span class="nf">get_text_from_item</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+</span><span id="__span-0-56"><a id="__codelineno-0-56" name="__codelineno-0-56"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)&quot;&quot;&quot;</span>
+</span><span id="__span-0-57"><a id="__codelineno-0-57" name="__codelineno-0-57"></a>    <span class="k">return</span> <span class="n">item</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">raw_jsonl_text_field</span><span class="p">]</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_texts</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Iterate over all input files and read JSON from each line.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-91">91</a></span>
+<span class="normal"><a href="#__codelineno-0-92">92</a></span>
+<span class="normal"><a href="#__codelineno-0-93">93</a></span>
+<span class="normal"><a href="#__codelineno-0-94">94</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-91"><a id="__codelineno-0-91" name="__codelineno-0-91"></a><span class="k">def</span> <span class="nf">get_texts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-92"><a id="__codelineno-0-92" name="__codelineno-0-92"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Iterate over all input files and read JSON from each line.&quot;&quot;&quot;</span>
+</span><span id="__span-0-93"><a id="__codelineno-0-93" name="__codelineno-0-93"></a>    <span class="c1"># if self.workers == 1:</span>
+</span><span id="__span-0-94"><a id="__codelineno-0-94" name="__codelineno-0-94"></a>    <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_texts_with_single_proc</span><span class="p">()</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_multi_proc" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_texts_with_multi_proc</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Iterate over all input files in parallel and read JSON from each line.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-98"> 98</a></span>
+<span class="normal"><a href="#__codelineno-0-99"> 99</a></span>
+<span class="normal"><a href="#__codelineno-0-100">100</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-98"><a id="__codelineno-0-98" name="__codelineno-0-98"></a><span class="k">def</span> <span class="nf">get_texts_with_multi_proc</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-99"><a id="__codelineno-0-99" name="__codelineno-0-99"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Iterate over all input files in parallel and read JSON from each line.&quot;&quot;&quot;</span>
+</span><span id="__span-0-100"><a id="__codelineno-0-100" name="__codelineno-0-100"></a>    <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">()</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+<div class="doc doc-object doc-function">
+
+
+<h2 id="llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_single_proc" class="doc doc-heading">
+            <code class="highlight language-python"><span class="n">get_texts_with_single_proc</span><span class="p">()</span></code>
+
+</h2>
+
+
+    <div class="doc doc-contents ">
+
+      <p>Iterate over all input files and read JSON from each line.</p>
+
+            <details class="quote">
+              <summary>Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code></summary>
+              <div class="language-python highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal"><a href="#__codelineno-0-108">108</a></span>
+<span class="normal"><a href="#__codelineno-0-109">109</a></span>
+<span class="normal"><a href="#__codelineno-0-110">110</a></span>
+<span class="normal"><a href="#__codelineno-0-111">111</a></span>
+<span class="normal"><a href="#__codelineno-0-112">112</a></span>
+<span class="normal"><a href="#__codelineno-0-113">113</a></span>
+<span class="normal"><a href="#__codelineno-0-114">114</a></span>
+<span class="normal"><a href="#__codelineno-0-115">115</a></span>
+<span class="normal"><a href="#__codelineno-0-116">116</a></span>
+<span class="normal"><a href="#__codelineno-0-117">117</a></span></pre></div></td><td class="code"><div><pre><span></span><code><span id="__span-0-108"><a id="__codelineno-0-108" name="__codelineno-0-108"></a><span class="k">def</span> <span class="nf">get_texts_with_single_proc</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+</span><span id="__span-0-109"><a id="__codelineno-0-109" name="__codelineno-0-109"></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;Iterate over all input files and read JSON from each line.&quot;&quot;&quot;</span>
+</span><span id="__span-0-110"><a id="__codelineno-0-110" name="__codelineno-0-110"></a>    <span class="n">processed_files</span> <span class="o">=</span> <span class="mi">0</span>
+</span><span id="__span-0-111"><a id="__codelineno-0-111" name="__codelineno-0-111"></a>    <span class="k">for</span> <span class="n">file_path</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_raw_jsonl_paths</span><span class="p">():</span>
+</span><span id="__span-0-112"><a id="__codelineno-0-112" name="__codelineno-0-112"></a>        <span class="k">yield from</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_texts_from_file_path</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span>
+</span><span id="__span-0-113"><a id="__codelineno-0-113" name="__codelineno-0-113"></a>
+</span><span id="__span-0-114"><a id="__codelineno-0-114" name="__codelineno-0-114"></a>        <span class="n">processed_files</span> <span class="o">+=</span> <span class="mi">1</span>
+</span><span id="__span-0-115"><a id="__codelineno-0-115" name="__codelineno-0-115"></a>
+</span><span id="__span-0-116"><a id="__codelineno-0-116" name="__codelineno-0-116"></a>    <span class="k">if</span> <span class="n">processed_files</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+</span><span id="__span-0-117"><a id="__codelineno-0-117" name="__codelineno-0-117"></a>        <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="s2">&quot;No file has been processed.&quot;</span><span class="p">)</span>
+</span></code></pre></div></td></tr></table></div>
+            </details>
+    </div>
+
+</div>
+
+
+
+  </div>
+
+    </div>
+
+</div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/assets/_mkdocstrings.css b/assets/_mkdocstrings.css
new file mode 100644
index 0000000..85449ec
--- /dev/null
+++ b/assets/_mkdocstrings.css
@@ -0,0 +1,119 @@
+
+/* Avoid breaking parameter names, etc. in table cells. */
+.doc-contents td code {
+  word-break: normal !important;
+}
+
+/* No line break before first paragraph of descriptions. */
+.doc-md-description,
+.doc-md-description>p:first-child {
+  display: inline;
+}
+
+/* Max width for docstring sections tables. */
+.doc .md-typeset__table,
+.doc .md-typeset__table table {
+  display: table !important;
+  width: 100%;
+}
+
+.doc .md-typeset__table tr {
+  display: table-row;
+}
+
+/* Defaults in Spacy table style. */
+.doc-param-default {
+  float: right;
+}
+
+/* Backward-compatibility: docstring section titles in bold. */
+.doc-section-title {
+  font-weight: bold;
+}
+
+/* Symbols in Navigation and ToC. */
+:root,
+[data-md-color-scheme="default"] {
+  --doc-symbol-attribute-fg-color: #953800;
+  --doc-symbol-function-fg-color: #8250df;
+  --doc-symbol-method-fg-color: #8250df;
+  --doc-symbol-class-fg-color: #0550ae;
+  --doc-symbol-module-fg-color: #5cad0f;
+
+  --doc-symbol-attribute-bg-color: #9538001a;
+  --doc-symbol-function-bg-color: #8250df1a;
+  --doc-symbol-method-bg-color: #8250df1a;
+  --doc-symbol-class-bg-color: #0550ae1a;
+  --doc-symbol-module-bg-color: #5cad0f1a;
+}
+
+[data-md-color-scheme="slate"] {
+  --doc-symbol-attribute-fg-color: #ffa657;
+  --doc-symbol-function-fg-color: #d2a8ff;
+  --doc-symbol-method-fg-color: #d2a8ff;
+  --doc-symbol-class-fg-color: #79c0ff;
+  --doc-symbol-module-fg-color: #baff79;
+
+  --doc-symbol-attribute-bg-color: #ffa6571a;
+  --doc-symbol-function-bg-color: #d2a8ff1a;
+  --doc-symbol-method-bg-color: #d2a8ff1a;
+  --doc-symbol-class-bg-color: #79c0ff1a;
+  --doc-symbol-module-bg-color: #baff791a;
+}
+
+code.doc-symbol {
+  border-radius: .1rem;
+  font-size: .85em;
+  padding: 0 .3em;
+  font-weight: bold;
+}
+
+code.doc-symbol-attribute {
+  color: var(--doc-symbol-attribute-fg-color);
+  background-color: var(--doc-symbol-attribute-bg-color);
+}
+
+code.doc-symbol-attribute::after {
+  content: "attr";
+}
+
+code.doc-symbol-function {
+  color: var(--doc-symbol-function-fg-color);
+  background-color: var(--doc-symbol-function-bg-color);
+}
+
+code.doc-symbol-function::after {
+  content: "func";
+}
+
+code.doc-symbol-method {
+  color: var(--doc-symbol-method-fg-color);
+  background-color: var(--doc-symbol-method-bg-color);
+}
+
+code.doc-symbol-method::after {
+  content: "meth";
+}
+
+code.doc-symbol-class {
+  color: var(--doc-symbol-class-fg-color);
+  background-color: var(--doc-symbol-class-bg-color);
+}
+
+code.doc-symbol-class::after {
+  content: "class";
+}
+
+code.doc-symbol-module {
+  color: var(--doc-symbol-module-fg-color);
+  background-color: var(--doc-symbol-module-bg-color);
+}
+
+code.doc-symbol-module::after {
+  content: "mod";
+}
+
+.doc-signature .autorefs {
+  color: inherit;
+  border-bottom: 1px dotted currentcolor;
+}
diff --git a/assets/images/favicon.png b/assets/images/favicon.png
new file mode 100644
index 0000000000000000000000000000000000000000..1cf13b9f9d978896599290a74f77d5dbe7d1655c
GIT binary patch
literal 1870
zcmV-U2eJ5xP)<h;3K|Lk000e1NJLTq001xm001xu1^@s6R|5Hm000LSNkl<Zc-qC6
z-%nIm5Xa90FNHT=R6u@%Dgvs=DnCU;MPy---z<U;QtPWtY@&@eHnCO{YySmh(Hf(P
zP1GkyjJ45dV`CEmRY7HaUMyMlVfvlhJ)X<ly|^l?lYH`>Gc)JR9QMau)O=X#!i9;T
z37kk-upj^(fsR36MHs_+1RCI)NNu9}lD0S{B^g8PN?Ww(5|~L#Ng*g{WsqleV}|#l
zz8@ri&cTzw_h33bHI+12+kK6WN$h#n5cD8OQt`5kw6p~9H3()bUQ8OS4Q4HTQ=1Ol
z_JAocz`fLbT2^{`8n~UAo=#AUOf=SOq4pYkt;XbC&f#7lb$*7=$na!mWCQ`dBQsO0
zLFBSPj*N?#u5&pf2t4XjEGH|=pPQ8xh7tpx;US<pCUP3oSao8bt`#wjtl{2%^)Z&4
zo$b%d&L$>5Cx_Ju;<?6m<a4ze--*b%^!cz$<mcvkFmxD7OKjxF2-jl2jB`xp%(<eE
z*dGS5y`7fk<)wk(lDs@(mNNPI29BGYUULp=d|+ziK5?!*L&wDoQ_#6T*vpnINdUJ2
zxv^Zv&`7S=dptB02+*TPk1PUb&X8yATF&V+q;GKV%lTWt{VTR$*|LFxrAukKpuk|Q
zQK#b$EnmN$UIv3C?fLWPd<nVV333*qKUDACt)khA<;#;m5YRht5UWJy3vKJ)&YdGS
zJ9GLp*XcdN7r6JF;#rlnq7^F+78Vo`)B-*ueZ6+oiKkD=&Bn*Z$fbvj%bv34fU;Y0
zVPP_uE=F=?Q4ztP4dOE9Ry#2<;eLVf@mS8as~aKY%@4p;MMb+-6&DlWp%~+vHDi1R
zzDC!bKYyOwEHF5z>!O`ya-yF`)b%TEt5>eP1ZX~}sjjA%FJF?h7cX8=b!DZl<6%Cv
z*G0uvvU+vmnpLZ2paivG-(cd*y3$hCIcsZcYOGh{$&)A6*XX&kXZd3G8m)G$Zz-LV
z^GF3VAW^Mdv!)4<hLfyrvBwxF6bjLk$B*fc8#m~elP9UQr6u~Fk`mXm+FD!b*HfqH
z&zm=`i%w5Z(}M>OM8EgqRiz~*Cji;uzl2uC9^=8I84vNp;ltJ|q-*uQwGp2ma6cY7
z;`%`!9UXO@fr&Ebapfs34OmS9^u6$)bJ<hmzumXbvG2S6`)P7=lBCHCh*^vuJvOND
zR~ac=w~p3Fz}oK1n7Dwu@%z52qJpaM!M$+d0>xrucutf>`dKPKT%%*d3XlFVKunp9
zasduxjrjs>f8V=D|J=XNZp;_Zy^WgQ<z;0$u5+WRqO9yyMR_?vxxi9uEqy4%Z;^Nz
zF5sL&tOe5wFY&9&$V**iQ|((hp!F$!D#Xk9r$(jd%gEU(8C(cDt*Wc4#;TFnP*p{&
z)kY;8kM9M>$9WDjgY=z@stwiEBm9u5*|34&1Na8BMjjgf3+SHcr`5~>oz1Y?SW^=K
z^bTyO6>Gar#P<TGubd(JOoOU+<Hi$!Hr3V=tAyi>_W2<cFMIg#q3&Z>gEMwq)ot3;
zREHn~U&Dp0l6YT0&k-wLwYjb?5zGK`W6S2v+K>AM(95m2<X`ZWVnflTty{K`25aFP
zos8oJ9^nG$5i1v!w)(AGcLA!euP0WCJ@@mmV>C20L|3m~rN8dprPr@t)5lsk9Hu*W
z?pS990s;Ez=+Rj{x7p``4>+c0G5^pYnB1^!TL=(?HLHZ<j<v73nGPR1MAxoeqc?Bf
z(EE%peC5g&I)pvg>+HicG{~4F1d^5Awl_2!1jICM-!9eoLhbbT^;yHcefyTAaqRcY
zmuctDopPT!%k+}x%lZRKnzykr2}}XfG_ne?nRQO~?%hkzo;@RN{P6o`&mMUWBYMTe
z6i8ChtjX&gXl`nvrU>jah)2iNM%JdjqoaeaU%yVn!^70x-flljp6Q5tK}5}&X8&&G
zX3fpb3E(!rH=zVI_<WXHgRqJkS*Oq7dU>9Gjl45w@{(ITqngWFe7@9{mX;tO25Z_8
zQHEpI+F<f#k8=(6u^M=vV~<bN`b_^Rkg8&_9Xob-Ko}6y3ZnV~k(X;{j4v<Gh+faL
zl&TN0*49>kTU#4xu>RkN>b3Tnc3UpWzPXWm#o55GKF09j^Mh~)K7{QqbO_~(@CVq!
zS<8954|P8mXN2MRs86xZ&Q4EfM@JB94b=(YGuk)s&^jiSF=t3*oNK3`rD{H`yQ?d;
ztE=laAUoZx5?RC8*WKOj`%LXEkgDd>&^Q4M^z`%u0rg-It=hLCVsq!Z%^6eB-OvOT
zFZ28TN&cRmgU}Elrnk43)!>Z1FCPL2K$7}gwzIc48NX}#!A1BpJP?#v5wkNprhV**
z?Cpalt1oH&{r!o3eSKc&ap)iz2BTn_VV`4>9M^b3;(YY}4>#ML6{~(4mH+?%07*qo
IM6N<$f(jP3KmY&$

literal 0
HcmV?d00001

diff --git a/assets/javascripts/bundle.fe8b6f2b.min.js b/assets/javascripts/bundle.fe8b6f2b.min.js
new file mode 100644
index 0000000..cf778d4
--- /dev/null
+++ b/assets/javascripts/bundle.fe8b6f2b.min.js
@@ -0,0 +1,29 @@
+"use strict";(()=>{var Fi=Object.create;var gr=Object.defineProperty;var ji=Object.getOwnPropertyDescriptor;var Wi=Object.getOwnPropertyNames,Dt=Object.getOwnPropertySymbols,Ui=Object.getPrototypeOf,xr=Object.prototype.hasOwnProperty,no=Object.prototype.propertyIsEnumerable;var oo=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,R=(e,t)=>{for(var r in t||(t={}))xr.call(t,r)&&oo(e,r,t[r]);if(Dt)for(var r of Dt(t))no.call(t,r)&&oo(e,r,t[r]);return e};var io=(e,t)=>{var r={};for(var o in e)xr.call(e,o)&&t.indexOf(o)<0&&(r[o]=e[o]);if(e!=null&&Dt)for(var o of Dt(e))t.indexOf(o)<0&&no.call(e,o)&&(r[o]=e[o]);return r};var yr=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Di=(e,t,r,o)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of Wi(t))!xr.call(e,n)&&n!==r&&gr(e,n,{get:()=>t[n],enumerable:!(o=ji(t,n))||o.enumerable});return e};var Vt=(e,t,r)=>(r=e!=null?Fi(Ui(e)):{},Di(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var ao=(e,t,r)=>new Promise((o,n)=>{var i=p=>{try{s(r.next(p))}catch(c){n(c)}},a=p=>{try{s(r.throw(p))}catch(c){n(c)}},s=p=>p.done?o(p.value):Promise.resolve(p.value).then(i,a);s((r=r.apply(e,t)).next())});var co=yr((Er,so)=>{(function(e,t){typeof Er=="object"&&typeof so!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(Er,function(){"use strict";function e(r){var o=!0,n=!1,i=null,a={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function s(H){return!!(H&&H!==document&&H.nodeName!=="HTML"&&H.nodeName!=="BODY"&&"classList"in H&&"contains"in H.classList)}function p(H){var mt=H.type,ze=H.tagName;return!!(ze==="INPUT"&&a[mt]&&!H.readOnly||ze==="TEXTAREA"&&!H.readOnly||H.isContentEditable)}function c(H){H.classList.contains("focus-visible")||(H.classList.add("focus-visible"),H.setAttribute("data-focus-visible-added",""))}function 
l(H){H.hasAttribute("data-focus-visible-added")&&(H.classList.remove("focus-visible"),H.removeAttribute("data-focus-visible-added"))}function f(H){H.metaKey||H.altKey||H.ctrlKey||(s(r.activeElement)&&c(r.activeElement),o=!0)}function u(H){o=!1}function h(H){s(H.target)&&(o||p(H.target))&&c(H.target)}function w(H){s(H.target)&&(H.target.classList.contains("focus-visible")||H.target.hasAttribute("data-focus-visible-added"))&&(n=!0,window.clearTimeout(i),i=window.setTimeout(function(){n=!1},100),l(H.target))}function A(H){document.visibilityState==="hidden"&&(n&&(o=!0),te())}function te(){document.addEventListener("mousemove",J),document.addEventListener("mousedown",J),document.addEventListener("mouseup",J),document.addEventListener("pointermove",J),document.addEventListener("pointerdown",J),document.addEventListener("pointerup",J),document.addEventListener("touchmove",J),document.addEventListener("touchstart",J),document.addEventListener("touchend",J)}function ie(){document.removeEventListener("mousemove",J),document.removeEventListener("mousedown",J),document.removeEventListener("mouseup",J),document.removeEventListener("pointermove",J),document.removeEventListener("pointerdown",J),document.removeEventListener("pointerup",J),document.removeEventListener("touchmove",J),document.removeEventListener("touchstart",J),document.removeEventListener("touchend",J)}function 
J(H){H.target.nodeName&&H.target.nodeName.toLowerCase()==="html"||(o=!1,ie())}document.addEventListener("keydown",f,!0),document.addEventListener("mousedown",u,!0),document.addEventListener("pointerdown",u,!0),document.addEventListener("touchstart",u,!0),document.addEventListener("visibilitychange",A,!0),te(),r.addEventListener("focus",h,!0),r.addEventListener("blur",w,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var Yr=yr((Rt,Kr)=>{/*!
+ * clipboard.js v2.0.11
+ * https://clipboardjs.com/
+ *
+ * Licensed MIT © Zeno Rocha
+ */(function(t,r){typeof Rt=="object"&&typeof Kr=="object"?Kr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Rt=="object"?Rt.ClipboardJS=r():t.ClipboardJS=r()})(Rt,function(){return function(){var e={686:function(o,n,i){"use strict";i.d(n,{default:function(){return Ii}});var a=i(279),s=i.n(a),p=i(370),c=i.n(p),l=i(817),f=i.n(l);function u(V){try{return document.execCommand(V)}catch(_){return!1}}var h=function(_){var M=f()(_);return u("cut"),M},w=h;function A(V){var _=document.documentElement.getAttribute("dir")==="rtl",M=document.createElement("textarea");M.style.fontSize="12pt",M.style.border="0",M.style.padding="0",M.style.margin="0",M.style.position="absolute",M.style[_?"right":"left"]="-9999px";var j=window.pageYOffset||document.documentElement.scrollTop;return M.style.top="".concat(j,"px"),M.setAttribute("readonly",""),M.value=V,M}var te=function(_,M){var j=A(_);M.container.appendChild(j);var D=f()(j);return u("copy"),j.remove(),D},ie=function(_){var M=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},j="";return typeof _=="string"?j=te(_,M):_ instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(_==null?void 0:_.type)?j=te(_.value,M):(j=f()(_),u("copy")),j},J=ie;function H(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?H=function(M){return typeof M}:H=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},H(V)}var mt=function(){var _=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},M=_.action,j=M===void 0?"copy":M,D=_.container,Y=_.target,ke=_.text;if(j!=="copy"&&j!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Y!==void 0)if(Y&&H(Y)==="object"&&Y.nodeType===1){if(j==="copy"&&Y.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(j==="cut"&&(Y.hasAttribute("readonly")||Y.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(ke)return J(ke,{container:D});if(Y)return j==="cut"?w(Y):J(Y,{container:D})},ze=mt;function Ie(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Ie=function(M){return typeof M}:Ie=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},Ie(V)}function _i(V,_){if(!(V instanceof _))throw new TypeError("Cannot call a class as a function")}function ro(V,_){for(var M=0;M<_.length;M++){var j=_[M];j.enumerable=j.enumerable||!1,j.configurable=!0,"value"in j&&(j.writable=!0),Object.defineProperty(V,j.key,j)}}function Ai(V,_,M){return _&&ro(V.prototype,_),M&&ro(V,M),V}function Ci(V,_){if(typeof _!="function"&&_!==null)throw new TypeError("Super expression must either be null or a function");V.prototype=Object.create(_&&_.prototype,{constructor:{value:V,writable:!0,configurable:!0}}),_&&br(V,_)}function br(V,_){return br=Object.setPrototypeOf||function(j,D){return j.__proto__=D,j},br(V,_)}function Hi(V){var _=Pi();return function(){var j=Wt(V),D;if(_){var Y=Wt(this).constructor;D=Reflect.construct(j,arguments,Y)}else D=j.apply(this,arguments);return ki(this,D)}}function ki(V,_){return _&&(Ie(_)==="object"||typeof _=="function")?_:$i(V)}function $i(V){if(V===void 0)throw new ReferenceError("this hasn't been initialised - super() hasn't been called");return V}function Pi(){if(typeof Reflect=="undefined"||!Reflect.construct||Reflect.construct.sham)return!1;if(typeof Proxy=="function")return!0;try{return Date.prototype.toString.call(Reflect.construct(Date,[],function(){})),!0}catch(V){return!1}}function Wt(V){return 
Wt=Object.setPrototypeOf?Object.getPrototypeOf:function(M){return M.__proto__||Object.getPrototypeOf(M)},Wt(V)}function vr(V,_){var M="data-clipboard-".concat(V);if(_.hasAttribute(M))return _.getAttribute(M)}var Ri=function(V){Ci(M,V);var _=Hi(M);function M(j,D){var Y;return _i(this,M),Y=_.call(this),Y.resolveOptions(D),Y.listenClick(j),Y}return Ai(M,[{key:"resolveOptions",value:function(){var D=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof D.action=="function"?D.action:this.defaultAction,this.target=typeof D.target=="function"?D.target:this.defaultTarget,this.text=typeof D.text=="function"?D.text:this.defaultText,this.container=Ie(D.container)==="object"?D.container:document.body}},{key:"listenClick",value:function(D){var Y=this;this.listener=c()(D,"click",function(ke){return Y.onClick(ke)})}},{key:"onClick",value:function(D){var Y=D.delegateTarget||D.currentTarget,ke=this.action(Y)||"copy",Ut=ze({action:ke,container:this.container,target:this.target(Y),text:this.text(Y)});this.emit(Ut?"success":"error",{action:ke,text:Ut,trigger:Y,clearSelection:function(){Y&&Y.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(D){return vr("action",D)}},{key:"defaultTarget",value:function(D){var Y=vr("target",D);if(Y)return document.querySelector(Y)}},{key:"defaultText",value:function(D){return vr("text",D)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(D){var Y=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return J(D,Y)}},{key:"cut",value:function(D){return w(D)}},{key:"isSupported",value:function(){var D=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Y=typeof D=="string"?[D]:D,ke=!!document.queryCommandSupported;return Y.forEach(function(Ut){ke=ke&&!!document.queryCommandSupported(Ut)}),ke}}]),M}(s()),Ii=Ri},828:function(o){var n=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var 
i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,p){for(;s&&s.nodeType!==n;){if(typeof s.matches=="function"&&s.matches(p))return s;s=s.parentNode}}o.exports=a},438:function(o,n,i){var a=i(828);function s(l,f,u,h,w){var A=c.apply(this,arguments);return l.addEventListener(u,A,w),{destroy:function(){l.removeEventListener(u,A,w)}}}function p(l,f,u,h,w){return typeof l.addEventListener=="function"?s.apply(null,arguments):typeof u=="function"?s.bind(null,document).apply(null,arguments):(typeof l=="string"&&(l=document.querySelectorAll(l)),Array.prototype.map.call(l,function(A){return s(A,f,u,h,w)}))}function c(l,f,u,h){return function(w){w.delegateTarget=a(w.target,f),w.delegateTarget&&h.call(l,w)}}o.exports=p},879:function(o,n){n.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},n.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||n.node(i[0]))},n.string=function(i){return typeof i=="string"||i instanceof String},n.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(o,n,i){var a=i(879),s=i(438);function p(u,h,w){if(!u&&!h&&!w)throw new Error("Missing required arguments");if(!a.string(h))throw new TypeError("Second argument must be a String");if(!a.fn(w))throw new TypeError("Third argument must be a Function");if(a.node(u))return c(u,h,w);if(a.nodeList(u))return l(u,h,w);if(a.string(u))return f(u,h,w);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(u,h,w){return u.addEventListener(h,w),{destroy:function(){u.removeEventListener(h,w)}}}function l(u,h,w){return 
Array.prototype.forEach.call(u,function(A){A.addEventListener(h,w)}),{destroy:function(){Array.prototype.forEach.call(u,function(A){A.removeEventListener(h,w)})}}}function f(u,h,w){return s(document.body,u,h,w)}o.exports=p},817:function(o){function n(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var p=window.getSelection(),c=document.createRange();c.selectNodeContents(i),p.removeAllRanges(),p.addRange(c),a=p.toString()}return a}o.exports=n},279:function(o){function n(){}n.prototype={on:function(i,a,s){var p=this.e||(this.e={});return(p[i]||(p[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var p=this;function c(){p.off(i,c),a.apply(s,arguments)}return c._=a,this.on(i,c,s)},emit:function(i){var a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),p=0,c=s.length;for(p;p<c;p++)s[p].fn.apply(s[p].ctx,a);return this},off:function(i,a){var s=this.e||(this.e={}),p=s[i],c=[];if(p&&a)for(var l=0,f=p.length;l<f;l++)p[l].fn!==a&&p[l].fn._!==a&&c.push(p[l]);return c.length?s[i]=c:delete s[i],this}},o.exports=n,o.exports.TinyEmitter=n}},t={};function r(o){if(t[o])return t[o].exports;var n=t[o]={exports:{}};return e[o](n,n.exports,r),n.exports}return function(){r.n=function(o){var n=o&&o.__esModule?function(){return o.default}:function(){return o};return r.d(n,{a:n}),n}}(),function(){r.d=function(o,n){for(var i in n)r.o(n,i)&&!r.o(o,i)&&Object.defineProperty(o,i,{enumerable:!0,get:n[i]})}}(),function(){r.o=function(o,n){return Object.prototype.hasOwnProperty.call(o,n)}}(),r(686)}().default})});var ti=yr((gT,ei)=>{"use strict";/*!
+ * escape-html
+ * Copyright(c) 2012-2013 TJ Holowaychuk
+ * Copyright(c) 2015 Andreas Lubbe
+ * Copyright(c) 2015 Tiancheng "Timothy" Gu
+ * MIT Licensed
+ */var ts=/["'&<>]/;ei.exports=rs;function rs(e){var t=""+e,r=ts.exec(t);if(!r)return t;var o,n="",i=0,a=0;for(i=r.index;i<t.length;i++){switch(t.charCodeAt(i)){case 34:o="&quot;";break;case 38:o="&amp;";break;case 39:o="&#39;";break;case 60:o="&lt;";break;case 62:o="&gt;";break;default:continue}a!==i&&(n+=t.substring(a,i)),a=i+1,n+=o}return a!==i?n+t.substring(a,i):n}});var t0=Vt(co());/*! *****************************************************************************
+Copyright (c) Microsoft Corporation.
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+***************************************************************************** */var wr=function(e,t){return wr=Object.setPrototypeOf||{__proto__:[]}instanceof Array&&function(r,o){r.__proto__=o}||function(r,o){for(var n in o)Object.prototype.hasOwnProperty.call(o,n)&&(r[n]=o[n])},wr(e,t)};function re(e,t){if(typeof t!="function"&&t!==null)throw new TypeError("Class extends value "+String(t)+" is not a constructor or null");wr(e,t);function r(){this.constructor=e}e.prototype=t===null?Object.create(t):(r.prototype=t.prototype,new r)}function po(e,t,r,o){function n(i){return i instanceof r?i:new r(function(a){a(i)})}return new(r||(r=Promise))(function(i,a){function s(l){try{c(o.next(l))}catch(f){a(f)}}function p(l){try{c(o.throw(l))}catch(f){a(f)}}function c(l){l.done?i(l.value):n(l.value).then(s,p)}c((o=o.apply(e,t||[])).next())})}function Nt(e,t){var r={label:0,sent:function(){if(i[0]&1)throw i[1];return i[1]},trys:[],ops:[]},o,n,i,a;return a={next:s(0),throw:s(1),return:s(2)},typeof Symbol=="function"&&(a[Symbol.iterator]=function(){return this}),a;function s(c){return function(l){return p([c,l])}}function p(c){if(o)throw new TypeError("Generator is already executing.");for(;r;)try{if(o=1,n&&(i=c[0]&2?n.return:c[0]?n.throw||((i=n.return)&&i.call(n),0):n.next)&&!(i=i.call(n,c[1])).done)return i;switch(n=0,i&&(c=[c[0]&2,i.value]),c[0]){case 0:case 1:i=c;break;case 4:return r.label++,{value:c[1],done:!1};case 5:r.label++,n=c[1],c=[0];continue;case 7:c=r.ops.pop(),r.trys.pop();continue;default:if(i=r.trys,!(i=i.length>0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]<i[3])){r.label=c[1];break}if(c[0]===6&&r.label<i[1]){r.label=i[1],i=c;break}if(i&&r.label<i[2]){r.label=i[2],r.ops.push(c);break}i[2]&&r.ops.pop(),r.trys.pop();continue}c=t.call(e,r)}catch(l){c=[6,l],n=0}finally{o=i=0}if(c[0]&5)throw c[1];return{value:c[0]?c[1]:void 0,done:!0}}}function de(e){var t=typeof Symbol=="function"&&Symbol.iterator,r=t&&e[t],o=0;if(r)return 
r.call(e);if(e&&typeof e.length=="number")return{next:function(){return e&&o>=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function N(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],a;try{for(;(t===void 0||t-- >0)&&!(n=o.next()).done;)i.push(n.value)}catch(s){a={error:s}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(a)throw a.error}}return i}function q(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o<n;o++)(i||!(o in t))&&(i||(i=Array.prototype.slice.call(t,0,o)),i[o]=t[o]);return e.concat(i||Array.prototype.slice.call(t))}function nt(e){return this instanceof nt?(this.v=e,this):new nt(e)}function lo(e,t,r){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var o=r.apply(e,t||[]),n,i=[];return n={},a("next"),a("throw"),a("return"),n[Symbol.asyncIterator]=function(){return this},n;function a(u){o[u]&&(n[u]=function(h){return new Promise(function(w,A){i.push([u,h,w,A])>1||s(u,h)})})}function s(u,h){try{p(o[u](h))}catch(w){f(i[0][3],w)}}function p(u){u.value instanceof nt?Promise.resolve(u.value.v).then(c,l):f(i[0][2],u)}function c(u){s("next",u)}function l(u){s("throw",u)}function f(u,h){u(h),i.shift(),i.length&&s(i[0][0],i[0][1])}}function mo(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof de=="function"?de(e):e[Symbol.iterator](),r={},o("next"),o("throw"),o("return"),r[Symbol.asyncIterator]=function(){return this},r);function o(i){r[i]=e[i]&&function(a){return new Promise(function(s,p){a=e[i](a),n(s,p,a.done,a.value)})}}function n(i,a,s,p){Promise.resolve(p).then(function(c){i({value:c,done:s})},a)}}function k(e){return typeof e=="function"}function ft(e){var t=function(o){Error.call(o),o.stack=new Error().stack},r=e(t);return 
r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var zt=ft(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription:
+`+r.map(function(o,n){return n+1+") "+o.toString()}).join(`
+  `):"",this.name="UnsubscriptionError",this.errors=r}});function qe(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Fe=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,o,n,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=de(a),p=s.next();!p.done;p=s.next()){var c=p.value;c.remove(this)}}catch(A){t={error:A}}finally{try{p&&!p.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var l=this.initialTeardown;if(k(l))try{l()}catch(A){i=A instanceof zt?A.errors:[A]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=de(f),h=u.next();!h.done;h=u.next()){var w=h.value;try{fo(w)}catch(A){i=i!=null?i:[],A instanceof zt?i=q(q([],N(i)),N(A.errors)):i.push(A)}}}catch(A){o={error:A}}finally{try{h&&!h.done&&(n=u.return)&&n.call(u)}finally{if(o)throw o.error}}}if(i)throw new zt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)fo(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&qe(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&qe(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Tr=Fe.EMPTY;function qt(e){return e instanceof Fe||e&&"closed"in e&&k(e.remove)&&k(e.add)&&k(e.unsubscribe)}function fo(e){k(e)?e():e.unsubscribe()}var $e={onUnhandledError:null,onStoppedNotification:null,Promise:void 
0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var ut={setTimeout:function(e,t){for(var r=[],o=2;o<arguments.length;o++)r[o-2]=arguments[o];var n=ut.delegate;return n!=null&&n.setTimeout?n.setTimeout.apply(n,q([e,t],N(r))):setTimeout.apply(void 0,q([e,t],N(r)))},clearTimeout:function(e){var t=ut.delegate;return((t==null?void 0:t.clearTimeout)||clearTimeout)(e)},delegate:void 0};function Qt(e){ut.setTimeout(function(){var t=$e.onUnhandledError;if(t)t(e);else throw e})}function he(){}var uo=function(){return Sr("C",void 0,void 0)}();function ho(e){return Sr("E",void 0,e)}function bo(e){return Sr("N",e,void 0)}function Sr(e,t,r){return{kind:e,value:t,error:r}}var it=null;function dt(e){if($e.useDeprecatedSynchronousErrorHandling){var t=!it;if(t&&(it={errorThrown:!1,error:null}),e(),t){var r=it,o=r.errorThrown,n=r.error;if(it=null,o)throw n}}else e()}function vo(e){$e.useDeprecatedSynchronousErrorHandling&&it&&(it.errorThrown=!0,it.error=e)}var Mt=function(e){re(t,e);function t(r){var o=e.call(this)||this;return o.isStopped=!1,r?(o.destination=r,qt(r)&&r.add(o)):o.destination=qi,o}return t.create=function(r,o,n){return new at(r,o,n)},t.prototype.next=function(r){this.isStopped?Mr(bo(r),this):this._next(r)},t.prototype.error=function(r){this.isStopped?Mr(ho(r),this):(this.isStopped=!0,this._error(r))},t.prototype.complete=function(){this.isStopped?Mr(uo,this):(this.isStopped=!0,this._complete())},t.prototype.unsubscribe=function(){this.closed||(this.isStopped=!0,e.prototype.unsubscribe.call(this),this.destination=null)},t.prototype._next=function(r){this.destination.next(r)},t.prototype._error=function(r){try{this.destination.error(r)}finally{this.unsubscribe()}},t.prototype._complete=function(){try{this.destination.complete()}finally{this.unsubscribe()}},t}(Fe);var Vi=Function.prototype.bind;function Or(e,t){return Vi.call(e,t)}var Ni=function(){function e(t){this.partialObserver=t}return e.prototype.next=function(t){var 
r=this.partialObserver;if(r.next)try{r.next(t)}catch(o){Kt(o)}},e.prototype.error=function(t){var r=this.partialObserver;if(r.error)try{r.error(t)}catch(o){Kt(o)}else Kt(t)},e.prototype.complete=function(){var t=this.partialObserver;if(t.complete)try{t.complete()}catch(r){Kt(r)}},e}(),at=function(e){re(t,e);function t(r,o,n){var i=e.call(this)||this,a;if(k(r)||!r)a={next:r!=null?r:void 0,error:o!=null?o:void 0,complete:n!=null?n:void 0};else{var s;i&&$e.useDeprecatedNextContext?(s=Object.create(r),s.unsubscribe=function(){return i.unsubscribe()},a={next:r.next&&Or(r.next,s),error:r.error&&Or(r.error,s),complete:r.complete&&Or(r.complete,s)}):a=r}return i.destination=new Ni(a),i}return t}(Mt);function Kt(e){$e.useDeprecatedSynchronousErrorHandling?vo(e):Qt(e)}function zi(e){throw e}function Mr(e,t){var r=$e.onStoppedNotification;r&&ut.setTimeout(function(){return r(e,t)})}var qi={closed:!0,next:he,error:zi,complete:he};var ht=function(){return typeof Symbol=="function"&&Symbol.observable||"@@observable"}();function le(e){return e}function go(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return Lr(e)}function Lr(e){return e.length===0?le:e.length===1?e[0]:function(r){return e.reduce(function(o,n){return n(o)},r)}}var F=function(){function e(t){t&&(this._subscribe=t)}return e.prototype.lift=function(t){var r=new e;return r.source=this,r.operator=t,r},e.prototype.subscribe=function(t,r,o){var n=this,i=Ki(t)?t:new at(t,r,o);return dt(function(){var a=n,s=a.operator,p=a.source;i.add(s?s.call(i,p):p?n._subscribe(i):n._trySubscribe(i))}),i},e.prototype._trySubscribe=function(t){try{return this._subscribe(t)}catch(r){t.error(r)}},e.prototype.forEach=function(t,r){var o=this;return r=xo(r),new r(function(n,i){var a=new at({next:function(s){try{t(s)}catch(p){i(p),a.unsubscribe()}},error:i,complete:n});o.subscribe(a)})},e.prototype._subscribe=function(t){var r;return(r=this.source)===null||r===void 0?void 0:r.subscribe(t)},e.prototype[ht]=function(){return 
this},e.prototype.pipe=function(){for(var t=[],r=0;r<arguments.length;r++)t[r]=arguments[r];return Lr(t)(this)},e.prototype.toPromise=function(t){var r=this;return t=xo(t),new t(function(o,n){var i;r.subscribe(function(a){return i=a},function(a){return n(a)},function(){return o(i)})})},e.create=function(t){return new e(t)},e}();function xo(e){var t;return(t=e!=null?e:$e.Promise)!==null&&t!==void 0?t:Promise}function Qi(e){return e&&k(e.next)&&k(e.error)&&k(e.complete)}function Ki(e){return e&&e instanceof Mt||Qi(e)&&qt(e)}function Yi(e){return k(e==null?void 0:e.lift)}function y(e){return function(t){if(Yi(t))return t.lift(function(r){try{return e(r,this)}catch(o){this.error(o)}});throw new TypeError("Unable to lift unknown Observable type")}}function T(e,t,r,o,n){return new Bi(e,t,r,o,n)}var Bi=function(e){re(t,e);function t(r,o,n,i,a,s){var p=e.call(this,r)||this;return p.onFinalize=a,p.shouldUnsubscribe=s,p._next=o?function(c){try{o(c)}catch(l){r.error(l)}}:e.prototype._next,p._error=i?function(c){try{i(c)}catch(l){r.error(l)}finally{this.unsubscribe()}}:e.prototype._error,p._complete=n?function(){try{n()}catch(c){r.error(c)}finally{this.unsubscribe()}}:e.prototype._complete,p}return t.prototype.unsubscribe=function(){var r;if(!this.shouldUnsubscribe||this.shouldUnsubscribe()){var o=this.closed;e.prototype.unsubscribe.call(this),!o&&((r=this.onFinalize)===null||r===void 0||r.call(this))}},t}(Mt);var bt={schedule:function(e){var t=requestAnimationFrame,r=cancelAnimationFrame,o=bt.delegate;o&&(t=o.requestAnimationFrame,r=o.cancelAnimationFrame);var n=t(function(i){r=void 0,e(i)});return new Fe(function(){return r==null?void 0:r(n)})},requestAnimationFrame:function(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=bt.delegate;return((r==null?void 0:r.requestAnimationFrame)||requestAnimationFrame).apply(void 0,q([],N(e)))},cancelAnimationFrame:function(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=bt.delegate;return((r==null?void 
0:r.cancelAnimationFrame)||cancelAnimationFrame).apply(void 0,q([],N(e)))},delegate:void 0};var yo=ft(function(e){return function(){e(this),this.name="ObjectUnsubscribedError",this.message="object unsubscribed"}});var g=function(e){re(t,e);function t(){var r=e.call(this)||this;return r.closed=!1,r.currentObservers=null,r.observers=[],r.isStopped=!1,r.hasError=!1,r.thrownError=null,r}return t.prototype.lift=function(r){var o=new Eo(this,this);return o.operator=r,o},t.prototype._throwIfClosed=function(){if(this.closed)throw new yo},t.prototype.next=function(r){var o=this;dt(function(){var n,i;if(o._throwIfClosed(),!o.isStopped){o.currentObservers||(o.currentObservers=Array.from(o.observers));try{for(var a=de(o.currentObservers),s=a.next();!s.done;s=a.next()){var p=s.value;p.next(r)}}catch(c){n={error:c}}finally{try{s&&!s.done&&(i=a.return)&&i.call(a)}finally{if(n)throw n.error}}}})},t.prototype.error=function(r){var o=this;dt(function(){if(o._throwIfClosed(),!o.isStopped){o.hasError=o.isStopped=!0,o.thrownError=r;for(var n=o.observers;n.length;)n.shift().error(r)}})},t.prototype.complete=function(){var r=this;dt(function(){if(r._throwIfClosed(),!r.isStopped){r.isStopped=!0;for(var o=r.observers;o.length;)o.shift().complete()}})},t.prototype.unsubscribe=function(){this.isStopped=this.closed=!0,this.observers=this.currentObservers=null},Object.defineProperty(t.prototype,"observed",{get:function(){var r;return((r=this.observers)===null||r===void 0?void 0:r.length)>0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,a=n.isStopped,s=n.observers;return i||a?Tr:(this.currentObservers=null,s.push(r),new 
Fe(function(){o.currentObservers=null,qe(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var o=this,n=o.hasError,i=o.thrownError,a=o.isStopped;n?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new F;return r.source=this,r},t.create=function(r,o){return new Eo(r,o)},t}(F);var Eo=function(e){re(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:Tr},t}(g);var _r=function(e){re(t,e);function t(r){var o=e.call(this)||this;return o._value=r,o}return Object.defineProperty(t.prototype,"value",{get:function(){return this.getValue()},enumerable:!1,configurable:!0}),t.prototype._subscribe=function(r){var o=e.prototype._subscribe.call(this,r);return!o.closed&&r.next(this._value),o},t.prototype.getValue=function(){var r=this,o=r.hasError,n=r.thrownError,i=r._value;if(o)throw n;return this._throwIfClosed(),i},t.prototype.next=function(r){e.prototype.next.call(this,this._value=r)},t}(g);var Lt={now:function(){return(Lt.delegate||Date).now()},delegate:void 0};var _t=function(e){re(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=Lt);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var 
o=this,n=o.isStopped,i=o._buffer,a=o._infiniteTimeWindow,s=o._timestampProvider,p=o._windowTime;n||(i.push(r),!a&&i.push(s.now()+p)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,a=n._buffer,s=a.slice(),p=0;p<s.length&&!r.closed;p+=i?1:2)r.next(s[p]);return this._checkFinalizedStatuses(r),o},t.prototype._trimBuffer=function(){var r=this,o=r._bufferSize,n=r._timestampProvider,i=r._buffer,a=r._infiniteTimeWindow,s=(a?1:2)*o;if(o<1/0&&s<i.length&&i.splice(0,i.length-s),!a){for(var p=n.now(),c=0,l=1;l<i.length&&i[l]<=p;l+=2)c=l;c&&i.splice(0,c+1)}},t}(g);var wo=function(e){re(t,e);function t(r,o){return e.call(this)||this}return t.prototype.schedule=function(r,o){return o===void 0&&(o=0),this},t}(Fe);var At={setInterval:function(e,t){for(var r=[],o=2;o<arguments.length;o++)r[o-2]=arguments[o];var n=At.delegate;return n!=null&&n.setInterval?n.setInterval.apply(n,q([e,t],N(r))):setInterval.apply(void 0,q([e,t],N(r)))},clearInterval:function(e){var t=At.delegate;return((t==null?void 0:t.clearInterval)||clearInterval)(e)},delegate:void 0};var vt=function(e){re(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n.pending=!1,n}return t.prototype.schedule=function(r,o){var n;if(o===void 0&&(o=0),this.closed)return this;this.state=r;var i=this.id,a=this.scheduler;return i!=null&&(this.id=this.recycleAsyncId(a,i,o)),this.pending=!0,this.delay=o,this.id=(n=this.id)!==null&&n!==void 0?n:this.requestAsyncId(a,this.id,o),this},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),At.setInterval(r.flush.bind(r,this),n)},t.prototype.recycleAsyncId=function(r,o,n){if(n===void 0&&(n=0),n!=null&&this.delay===n&&this.pending===!1)return o;o!=null&&At.clearInterval(o)},t.prototype.execute=function(r,o){if(this.closed)return new Error("executing a cancelled action");this.pending=!1;var 
n=this._execute(r,o);if(n)return n;this.pending===!1&&this.id!=null&&(this.id=this.recycleAsyncId(this.scheduler,this.id,null))},t.prototype._execute=function(r,o){var n=!1,i;try{this.work(r)}catch(a){n=!0,i=a||new Error("Scheduled action threw falsy error")}if(n)return this.unsubscribe(),i},t.prototype.unsubscribe=function(){if(!this.closed){var r=this,o=r.id,n=r.scheduler,i=n.actions;this.work=this.state=this.scheduler=null,this.pending=!1,qe(i,this),o!=null&&(this.id=this.recycleAsyncId(n,o,null)),this.delay=null,e.prototype.unsubscribe.call(this)}},t}(wo);var Ar=function(){function e(t,r){r===void 0&&(r=e.now),this.schedulerActionCtor=t,this.now=r}return e.prototype.schedule=function(t,r,o){return r===void 0&&(r=0),new this.schedulerActionCtor(this,t).schedule(o,r)},e.now=Lt.now,e}();var gt=function(e){re(t,e);function t(r,o){o===void 0&&(o=Ar.now);var n=e.call(this,r,o)||this;return n.actions=[],n._active=!1,n}return t.prototype.flush=function(r){var o=this.actions;if(this._active){o.push(r);return}var n;this._active=!0;do if(n=r.execute(r.state,r.delay))break;while(r=o.shift());if(this._active=!1,n){for(;r=o.shift();)r.unsubscribe();throw n}},t}(Ar);var se=new gt(vt),Cr=se;var To=function(e){re(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return t.prototype.schedule=function(r,o){return o===void 0&&(o=0),o>0?e.prototype.schedule.call(this,r,o):(this.delay=o,this.state=r,this.scheduler.flush(this),this)},t.prototype.execute=function(r,o){return o>0||this.closed?e.prototype.execute.call(this,r,o):this._execute(r,o)},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!=null&&n>0||n==null&&this.delay>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.flush(this),0)},t}(vt);var So=function(e){re(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t}(gt);var Hr=new So(To);var Oo=function(e){re(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return 
t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!==null&&n>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=bt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var a=r.actions;o!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==o&&(bt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(vt);var Mo=function(e){re(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(gt);var me=new Mo(Oo);var O=new F(function(e){return e.complete()});function Yt(e){return e&&k(e.schedule)}function kr(e){return e[e.length-1]}function Xe(e){return k(kr(e))?e.pop():void 0}function He(e){return Yt(kr(e))?e.pop():void 0}function Bt(e,t){return typeof kr(e)=="number"?e.pop():t}var xt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Gt(e){return k(e==null?void 0:e.then)}function Jt(e){return k(e[ht])}function Xt(e){return Symbol.asyncIterator&&k(e==null?void 0:e[Symbol.asyncIterator])}function Zt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Gi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var er=Gi();function tr(e){return k(e==null?void 0:e[er])}function rr(e){return lo(this,arguments,function(){var r,o,n,i;return Nt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,nt(r.read())];case 3:return o=a.sent(),n=o.value,i=o.done,i?[4,nt(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,nt(n)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function or(e){return k(e==null?void 0:e.getReader)}function W(e){if(e instanceof F)return e;if(e!=null){if(Jt(e))return Ji(e);if(xt(e))return Xi(e);if(Gt(e))return Zi(e);if(Xt(e))return Lo(e);if(tr(e))return ea(e);if(or(e))return ta(e)}throw Zt(e)}function Ji(e){return new F(function(t){var r=e[ht]();if(k(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function Xi(e){return new F(function(t){for(var r=0;r<e.length&&!t.closed;r++)t.next(e[r]);t.complete()})}function Zi(e){return new F(function(t){e.then(function(r){t.closed||(t.next(r),t.complete())},function(r){return t.error(r)}).then(null,Qt)})}function ea(e){return new F(function(t){var r,o;try{for(var n=de(e),i=n.next();!i.done;i=n.next()){var a=i.value;if(t.next(a),t.closed)return}}catch(s){r={error:s}}finally{try{i&&!i.done&&(o=n.return)&&o.call(n)}finally{if(r)throw r.error}}t.complete()})}function Lo(e){return new F(function(t){ra(e,t).catch(function(r){return t.error(r)})})}function ta(e){return Lo(rr(e))}function ra(e,t){var r,o,n,i;return po(this,void 0,void 0,function(){var a,s;return Nt(this,function(p){switch(p.label){case 0:p.trys.push([0,5,6,11]),r=mo(e),p.label=1;case 1:return[4,r.next()];case 
2:if(o=p.sent(),!!o.done)return[3,4];if(a=o.value,t.next(a),t.closed)return[2];p.label=3;case 3:return[3,1];case 4:return[3,11];case 5:return s=p.sent(),n={error:s},[3,11];case 6:return p.trys.push([6,,9,10]),o&&!o.done&&(i=r.return)?[4,i.call(r)]:[3,8];case 7:p.sent(),p.label=8;case 8:return[3,10];case 9:if(n)throw n.error;return[7];case 10:return[7];case 11:return t.complete(),[2]}})})}function we(e,t,r,o,n){o===void 0&&(o=0),n===void 0&&(n=!1);var i=t.schedule(function(){r(),n?e.add(this.schedule(null,o)):this.unsubscribe()},o);if(e.add(i),!n)return i}function be(e,t){return t===void 0&&(t=0),y(function(r,o){r.subscribe(T(o,function(n){return we(o,e,function(){return o.next(n)},t)},function(){return we(o,e,function(){return o.complete()},t)},function(n){return we(o,e,function(){return o.error(n)},t)}))})}function Qe(e,t){return t===void 0&&(t=0),y(function(r,o){o.add(e.schedule(function(){return r.subscribe(o)},t))})}function _o(e,t){return W(e).pipe(Qe(t),be(t))}function Ao(e,t){return W(e).pipe(Qe(t),be(t))}function Co(e,t){return new F(function(r){var o=0;return t.schedule(function(){o===e.length?r.complete():(r.next(e[o++]),r.closed||this.schedule())})})}function Ho(e,t){return new F(function(r){var o;return we(r,t,function(){o=e[er](),we(r,t,function(){var n,i,a;try{n=o.next(),i=n.value,a=n.done}catch(s){r.error(s);return}a?r.complete():r.next(i)},0,!0)}),function(){return k(o==null?void 0:o.return)&&o.return()}})}function nr(e,t){if(!e)throw new Error("Iterable cannot be null");return new F(function(r){we(r,t,function(){var o=e[Symbol.asyncIterator]();we(r,t,function(){o.next().then(function(n){n.done?r.complete():r.next(n.value)})},0,!0)})})}function ko(e,t){return nr(rr(e),t)}function $o(e,t){if(e!=null){if(Jt(e))return _o(e,t);if(xt(e))return Co(e,t);if(Gt(e))return Ao(e,t);if(Xt(e))return nr(e,t);if(tr(e))return Ho(e,t);if(or(e))return ko(e,t)}throw Zt(e)}function ue(e,t){return t?$o(e,t):W(e)}function I(){for(var 
e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=He(e);return ue(e,r)}function $r(e,t){var r=k(e)?e:function(){return e},o=function(n){return n.error(r())};return new F(t?function(n){return t.schedule(o,0,n)}:o)}var ir=ft(function(e){return function(){e(this),this.name="EmptyError",this.message="no elements in sequence"}});function Po(e){return e instanceof Date&&!isNaN(e)}function m(e,t){return y(function(r,o){var n=0;r.subscribe(T(o,function(i){o.next(e.call(t,i,n++))}))})}var oa=Array.isArray;function na(e,t){return oa(t)?e.apply(void 0,q([],N(t))):e(t)}function Ze(e){return m(function(t){return na(e,t)})}var ia=Array.isArray,aa=Object.getPrototypeOf,sa=Object.prototype,ca=Object.keys;function Ro(e){if(e.length===1){var t=e[0];if(ia(t))return{args:t,keys:null};if(pa(t)){var r=ca(t);return{args:r.map(function(o){return t[o]}),keys:r}}}return{args:e,keys:null}}function pa(e){return e&&typeof e=="object"&&aa(e)===sa}function Io(e,t){return e.reduce(function(r,o,n){return r[o]=t[n],r},{})}function z(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=He(e),o=Xe(e),n=Ro(e),i=n.args,a=n.keys;if(i.length===0)return ue([],r);var s=new F(Pr(i,r,a?function(p){return Io(a,p)}:le));return o?s.pipe(Ze(o)):s}function Pr(e,t,r){return r===void 0&&(r=le),function(o){Fo(t,function(){for(var n=e.length,i=new Array(n),a=n,s=n,p=function(l){Fo(t,function(){var f=ue(e[l],t),u=!1;f.subscribe(T(o,function(h){i[l]=h,u||(u=!0,s--),s||o.next(r(i.slice()))},function(){--a||o.complete()}))},o)},c=0;c<n;c++)p(c)},o)}}function Fo(e,t,r){e?we(r,e,t):t()}function jo(e,t,r,o,n,i,a,s){var p=[],c=0,l=0,f=!1,u=function(){f&&!p.length&&!c&&t.complete()},h=function(A){return c<o?w(A):p.push(A)},w=function(A){i&&t.next(A),c++;var te=!1;W(r(A,l++)).subscribe(T(t,function(ie){n==null||n(ie),i?h(ie):t.next(ie)},function(){te=!0},void 0,function(){if(te)try{c--;for(var ie=function(){var J=p.shift();a?we(t,a,function(){return 
w(J)}):w(J)};p.length&&c<o;)ie();u()}catch(J){t.error(J)}}))};return e.subscribe(T(t,h,function(){f=!0,u()})),function(){s==null||s()}}function oe(e,t,r){return r===void 0&&(r=1/0),k(t)?oe(function(o,n){return m(function(i,a){return t(o,i,n,a)})(W(e(o,n)))},r):(typeof t=="number"&&(r=t),y(function(o,n){return jo(o,n,e,r)}))}function yt(e){return e===void 0&&(e=1/0),oe(le,e)}function Wo(){return yt(1)}function je(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return Wo()(ue(e,He(e)))}function C(e){return new F(function(t){W(e()).subscribe(t)})}var la=["addListener","removeListener"],ma=["addEventListener","removeEventListener"],fa=["on","off"];function d(e,t,r,o){if(k(r)&&(o=r,r=void 0),o)return d(e,t,r).pipe(Ze(o));var n=N(ha(e)?ma.map(function(s){return function(p){return e[s](t,p,r)}}):ua(e)?la.map(Uo(e,t)):da(e)?fa.map(Uo(e,t)):[],2),i=n[0],a=n[1];if(!i&&xt(e))return oe(function(s){return d(s,t,r)})(W(e));if(!i)throw new TypeError("Invalid event target");return new F(function(s){var p=function(){for(var c=[],l=0;l<arguments.length;l++)c[l]=arguments[l];return s.next(1<c.length?c:c[0])};return i(p),function(){return a(p)}})}function Uo(e,t){return function(r){return function(o){return e[r](t,o)}}}function ua(e){return k(e.addListener)&&k(e.removeListener)}function da(e){return k(e.on)&&k(e.off)}function ha(e){return k(e.addEventListener)&&k(e.removeEventListener)}function ar(e,t,r){return r?ar(e,t).pipe(Ze(r)):new F(function(o){var n=function(){for(var a=[],s=0;s<arguments.length;s++)a[s]=arguments[s];return o.next(a.length===1?a[0]:a)},i=e(n);return k(t)?function(){return t(n,i)}:void 0})}function Me(e,t,r){e===void 0&&(e=0),r===void 0&&(r=Cr);var o=-1;return t!=null&&(Yt(t)?r=t:o=t),new F(function(n){var i=Po(e)?+e-r.now():e;i<0&&(i=0);var a=0;return r.schedule(function(){n.closed||(n.next(a++),0<=o?this.schedule(void 0,o):n.complete())},i)})}function S(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=He(e),o=Bt(e,1/0),n=e;return 
n.length?n.length===1?W(n[0]):yt(o)(ue(n,r)):O}var Ke=new F(he);var ba=Array.isArray;function Et(e){return e.length===1&&ba(e[0])?e[0]:e}function b(e,t){return y(function(r,o){var n=0;r.subscribe(T(o,function(i){return e.call(t,i,n++)&&o.next(i)}))})}function Ct(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=Xe(e),o=Et(e);return o.length?new F(function(n){var i=o.map(function(){return[]}),a=o.map(function(){return!1});n.add(function(){i=a=null});for(var s=function(c){W(o[c]).subscribe(T(n,function(l){if(i[c].push(l),i.every(function(u){return u.length})){var f=i.map(function(u){return u.shift()});n.next(r?r.apply(void 0,q([],N(f))):f),i.some(function(u,h){return!u.length&&a[h]})&&n.complete()}},function(){a[c]=!0,!i[c].length&&n.complete()}))},p=0;!n.closed&&p<o.length;p++)s(p);return function(){i=a=null}}):O}function Do(e){return y(function(t,r){var o=!1,n=null,i=null,a=!1,s=function(){if(i==null||i.unsubscribe(),i=null,o){o=!1;var c=n;n=null,r.next(c)}a&&r.complete()},p=function(){i=null,a&&r.complete()};t.subscribe(T(r,function(c){o=!0,n=c,i||W(e(c)).subscribe(i=T(r,s,p))},function(){a=!0,(!o||!i||i.closed)&&r.complete()}))})}function Le(e,t){return t===void 0&&(t=se),Do(function(){return Me(e,t)})}function Ye(e,t){return t===void 0&&(t=null),t=t!=null?t:e,y(function(r,o){var n=[],i=0;r.subscribe(T(o,function(a){var s,p,c,l,f=null;i++%t===0&&n.push([]);try{for(var u=de(n),h=u.next();!h.done;h=u.next()){var w=h.value;w.push(a),e<=w.length&&(f=f!=null?f:[],f.push(w))}}catch(ie){s={error:ie}}finally{try{h&&!h.done&&(p=u.return)&&p.call(u)}finally{if(s)throw s.error}}if(f)try{for(var A=de(f),te=A.next();!te.done;te=A.next()){var w=te.value;qe(n,w),o.next(w)}}catch(ie){c={error:ie}}finally{try{te&&!te.done&&(l=A.return)&&l.call(A)}finally{if(c)throw c.error}}},function(){var a,s;try{for(var p=de(n),c=p.next();!c.done;c=p.next()){var l=c.value;o.next(l)}}catch(f){a={error:f}}finally{try{c&&!c.done&&(s=p.return)&&s.call(p)}finally{if(a)throw 
a.error}}o.complete()},void 0,function(){n=null}))})}function ve(e){return y(function(t,r){var o=null,n=!1,i;o=t.subscribe(T(r,void 0,void 0,function(a){i=W(e(a,ve(e)(t))),o?(o.unsubscribe(),o=null,i.subscribe(r)):n=!0})),n&&(o.unsubscribe(),o=null,i.subscribe(r))})}function Vo(e,t,r,o,n){return function(i,a){var s=r,p=t,c=0;i.subscribe(T(a,function(l){var f=c++;p=s?e(p,l,f):(s=!0,l),o&&a.next(p)},n&&function(){s&&a.next(p),a.complete()}))}}function Rr(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=Xe(e);return r?go(Rr.apply(void 0,q([],N(e))),Ze(r)):y(function(o,n){Pr(q([o],N(Et(e))))(n)})}function We(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return Rr.apply(void 0,q([],N(e)))}function Ht(e){return y(function(t,r){var o=!1,n=null,i=null,a=function(){if(i==null||i.unsubscribe(),i=null,o){o=!1;var s=n;n=null,r.next(s)}};t.subscribe(T(r,function(s){i==null||i.unsubscribe(),o=!0,n=s,i=T(r,a,he),W(e(s)).subscribe(i)},function(){a(),r.complete()},void 0,function(){n=i=null}))})}function _e(e,t){return t===void 0&&(t=se),y(function(r,o){var n=null,i=null,a=null,s=function(){if(n){n.unsubscribe(),n=null;var c=i;i=null,o.next(c)}};function p(){var c=a+e,l=t.now();if(l<c){n=this.schedule(void 0,c-l),o.add(n);return}s()}r.subscribe(T(o,function(c){i=c,a=t.now(),n||(n=t.schedule(p,e),o.add(n))},function(){s(),o.complete()},void 0,function(){i=n=null}))})}function Be(e){return y(function(t,r){var o=!1;t.subscribe(T(r,function(n){o=!0,r.next(n)},function(){o||r.next(e),r.complete()}))})}function Te(e){return e<=0?function(){return O}:y(function(t,r){var o=0;t.subscribe(T(r,function(n){++o<=e&&(r.next(n),e<=o&&r.complete())}))})}function X(){return y(function(e,t){e.subscribe(T(t,he))})}function No(e){return m(function(){return e})}function Ir(e,t){return t?function(r){return je(t.pipe(Te(1),X()),r.pipe(Ir(e)))}:oe(function(r,o){return W(e(r,o)).pipe(Te(1),No(r))})}function Ge(e,t){t===void 0&&(t=se);var r=Me(e,t);return Ir(function(){return 
r})}function K(e,t){return t===void 0&&(t=le),e=e!=null?e:va,y(function(r,o){var n,i=!0;r.subscribe(T(o,function(a){var s=t(a);(i||!e(n,s))&&(i=!1,n=s,o.next(a))}))})}function va(e,t){return e===t}function Z(e,t){return K(function(r,o){return t?t(r[e],o[e]):r[e]===o[e]})}function zo(e){return e===void 0&&(e=ga),y(function(t,r){var o=!1;t.subscribe(T(r,function(n){o=!0,r.next(n)},function(){return o?r.complete():r.error(e())}))})}function ga(){return new ir}function ne(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return function(r){return je(r,I.apply(void 0,q([],N(e))))}}function L(e){return y(function(t,r){try{t.subscribe(r)}finally{r.add(e)}})}function Ae(e,t){var r=arguments.length>=2;return function(o){return o.pipe(e?b(function(n,i){return e(n,i,o)}):le,Te(1),r?Be(t):zo(function(){return new ir}))}}function Fr(e){return e<=0?function(){return O}:y(function(t,r){var o=[];t.subscribe(T(r,function(n){o.push(n),e<o.length&&o.shift()},function(){var n,i;try{for(var a=de(o),s=a.next();!s.done;s=a.next()){var p=s.value;r.next(p)}}catch(c){n={error:c}}finally{try{s&&!s.done&&(i=a.return)&&i.call(a)}finally{if(n)throw n.error}}r.complete()},void 0,function(){o=null}))})}function qo(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=He(e),o=Bt(e,1/0);return e=Et(e),y(function(n,i){yt(o)(ue(q([n],N(e)),r)).subscribe(i)})}function Pe(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return qo.apply(void 0,q([],N(e)))}function st(e){var t,r=1/0,o;return e!=null&&(typeof e=="object"?(t=e.count,r=t===void 0?1/0:t,o=e.delay):r=e),r<=0?function(){return O}:y(function(n,i){var a=0,s,p=function(){if(s==null||s.unsubscribe(),s=null,o!=null){var l=typeof o=="number"?Me(o):W(o(a)),f=T(i,function(){f.unsubscribe(),c()});l.subscribe(f)}else c()},c=function(){var l=!1;s=n.subscribe(T(i,void 0,function(){++a<r?s?p():l=!0:i.complete()})),l&&p()};c()})}function jr(e,t){return y(Vo(e,t,arguments.length>=2,!0))}function pe(e){e===void 0&&(e={});var 
t=e.connector,r=t===void 0?function(){return new g}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,p=s===void 0?!0:s;return function(c){var l,f,u,h=0,w=!1,A=!1,te=function(){f==null||f.unsubscribe(),f=void 0},ie=function(){te(),l=u=void 0,w=A=!1},J=function(){var H=l;ie(),H==null||H.unsubscribe()};return y(function(H,mt){h++,!A&&!w&&te();var ze=u=u!=null?u:r();mt.add(function(){h--,h===0&&!A&&!w&&(f=Wr(J,p))}),ze.subscribe(mt),!l&&h>0&&(l=new at({next:function(Ie){return ze.next(Ie)},error:function(Ie){A=!0,te(),f=Wr(ie,n,Ie),ze.error(Ie)},complete:function(){w=!0,te(),f=Wr(ie,a),ze.complete()}}),W(H).subscribe(l))})(c)}}function Wr(e,t){for(var r=[],o=2;o<arguments.length;o++)r[o-2]=arguments[o];if(t===!0){e();return}if(t!==!1){var n=new at({next:function(){n.unsubscribe(),e()}});return W(t.apply(void 0,q([],N(r)))).subscribe(n)}}function G(e,t,r){var o,n,i,a,s=!1;return e&&typeof e=="object"?(o=e.bufferSize,a=o===void 0?1/0:o,n=e.windowTime,t=n===void 0?1/0:n,i=e.refCount,s=i===void 0?!1:i,r=e.scheduler):a=e!=null?e:1/0,pe({connector:function(){return new _t(a,t,r)},resetOnError:!0,resetOnComplete:!1,resetOnRefCountZero:s})}function Ce(e){return b(function(t,r){return e<=r})}function Ur(e){return y(function(t,r){var o=!1,n=T(r,function(){n==null||n.unsubscribe(),o=!0},he);W(e).subscribe(n),t.subscribe(T(r,function(i){return o&&r.next(i)}))})}function Q(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=He(e);return y(function(o,n){(r?je(e,o,r):je(e,o)).subscribe(n)})}function v(e,t){return y(function(r,o){var n=null,i=0,a=!1,s=function(){return a&&!n&&o.complete()};r.subscribe(T(o,function(p){n==null||n.unsubscribe();var c=0,l=i++;W(e(p,l)).subscribe(n=T(o,function(f){return o.next(t?t(p,f,l,c++):f)},function(){n=null,s()}))},function(){a=!0,s()}))})}function U(e){return y(function(t,r){W(e).subscribe(T(r,function(){return r.complete()},he)),!r.closed&&t.subscribe(r)})}function Dr(e,t){return 
t===void 0&&(t=!1),y(function(r,o){var n=0;r.subscribe(T(o,function(i){var a=e(i,n++);(a||t)&&o.next(i),!a&&o.complete()}))})}function E(e,t,r){var o=k(e)||t||r?{next:e,error:t,complete:r}:e;return o?y(function(n,i){var a;(a=o.subscribe)===null||a===void 0||a.call(o);var s=!0;n.subscribe(T(i,function(p){var c;(c=o.next)===null||c===void 0||c.call(o,p),i.next(p)},function(){var p;s=!1,(p=o.complete)===null||p===void 0||p.call(o),i.complete()},function(p){var c;s=!1,(c=o.error)===null||c===void 0||c.call(o,p),i.error(p)},function(){var p,c;s&&((p=o.unsubscribe)===null||p===void 0||p.call(o)),(c=o.finalize)===null||c===void 0||c.call(o)}))}):le}function Qo(e,t){return y(function(r,o){var n=t!=null?t:{},i=n.leading,a=i===void 0?!0:i,s=n.trailing,p=s===void 0?!1:s,c=!1,l=null,f=null,u=!1,h=function(){f==null||f.unsubscribe(),f=null,p&&(te(),u&&o.complete())},w=function(){f=null,u&&o.complete()},A=function(ie){return f=W(e(ie)).subscribe(T(o,h,w))},te=function(){if(c){c=!1;var ie=l;l=null,o.next(ie),!u&&A(ie)}};r.subscribe(T(o,function(ie){c=!0,l=ie,!(f&&!f.closed)&&(a?te():A(ie))},function(){u=!0,!(p&&c&&f&&!f.closed)&&o.complete()}))})}function ct(e,t,r){t===void 0&&(t=se);var o=Me(e,t);return Qo(function(){return o},r)}function ee(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];var r=Xe(e);return y(function(o,n){for(var i=e.length,a=new Array(i),s=e.map(function(){return!1}),p=!1,c=function(f){W(e[f]).subscribe(T(n,function(u){a[f]=u,!p&&!s[f]&&(s[f]=!0,(p=s.every(le))&&(s=null))},he))},l=0;l<i;l++)c(l);o.subscribe(T(n,function(f){if(p){var u=q([f],N(a));n.next(r?r.apply(void 0,q([],N(u))):u)}}))})}function Ko(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return y(function(r,o){Ct.apply(void 0,q([r],N(e))).subscribe(o)})}function Vr(){for(var e=[],t=0;t<arguments.length;t++)e[t]=arguments[t];return Ko.apply(void 0,q([],N(e)))}function Yo(){let e=new _t(1);return 
d(document,"DOMContentLoaded",{once:!0}).subscribe(()=>e.next(document)),e}function $(e,t=document){return Array.from(t.querySelectorAll(e))}function P(e,t=document){let r=fe(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function fe(e,t=document){return t.querySelector(e)||void 0}function Re(){var e,t,r,o;return(o=(r=(t=(e=document.activeElement)==null?void 0:e.shadowRoot)==null?void 0:t.activeElement)!=null?r:document.activeElement)!=null?o:void 0}var xa=S(d(document.body,"focusin"),d(document.body,"focusout")).pipe(_e(1),Q(void 0),m(()=>Re()||document.body),G(1));function et(e){return xa.pipe(m(t=>e.contains(t)),K())}function kt(e,t){return C(()=>S(d(e,"mouseenter").pipe(m(()=>!0)),d(e,"mouseleave").pipe(m(()=>!1))).pipe(t?Ht(r=>Me(+!r*t)):le,Q(e.matches(":hover"))))}function Bo(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Bo(e,r)}function x(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Bo(o,n);return o}function sr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function wt(e){let t=x("script",{src:e});return C(()=>(document.head.appendChild(t),S(d(t,"load"),d(t,"error").pipe(v(()=>$r(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),L(()=>document.head.removeChild(t)),Te(1))))}var Go=new g,ya=C(()=>typeof ResizeObserver=="undefined"?wt("https://unpkg.com/resize-observer-polyfill"):I(void 0)).pipe(m(()=>new ResizeObserver(e=>e.forEach(t=>Go.next(t)))),v(e=>S(Ke,I(e)).pipe(L(()=>e.disconnect()))),G(1));function ce(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){let t=e;for(;t.clientWidth===0&&t.parentElement;)t=t.parentElement;return 
ya.pipe(E(r=>r.observe(t)),v(r=>Go.pipe(b(o=>o.target===t),L(()=>r.unobserve(t)))),m(()=>ce(e)),Q(ce(e)))}function Tt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function cr(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}function Jo(e){let t=[],r=e.parentElement;for(;r;)(e.clientWidth>r.clientWidth||e.clientHeight>r.clientHeight)&&t.push(r),r=(e=r).parentElement;return t.length===0&&t.push(document.documentElement),t}function Ue(e){return{x:e.offsetLeft,y:e.offsetTop}}function Xo(e){let t=e.getBoundingClientRect();return{x:t.x+window.scrollX,y:t.y+window.scrollY}}function Zo(e){return S(d(window,"load"),d(window,"resize")).pipe(Le(0,me),m(()=>Ue(e)),Q(Ue(e)))}function pr(e){return{x:e.scrollLeft,y:e.scrollTop}}function De(e){return S(d(e,"scroll"),d(window,"scroll"),d(window,"resize")).pipe(Le(0,me),m(()=>pr(e)),Q(pr(e)))}var en=new g,Ea=C(()=>I(new IntersectionObserver(e=>{for(let t of e)en.next(t)},{threshold:0}))).pipe(v(e=>S(Ke,I(e)).pipe(L(()=>e.disconnect()))),G(1));function tt(e){return Ea.pipe(E(t=>t.observe(e)),v(t=>en.pipe(b(({target:r})=>r===e),L(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function tn(e,t=16){return De(e).pipe(m(({y:r})=>{let o=ce(e),n=Tt(e);return r>=n.height-o.height-t}),K())}var lr={drawer:P("[data-md-toggle=drawer]"),search:P("[data-md-toggle=search]")};function rn(e){return lr[e].checked}function Je(e,t){lr[e].checked!==t&&lr[e].click()}function Ve(e){let t=lr[e];return d(t,"change").pipe(m(()=>t.checked),Q(t.checked))}function wa(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Ta(){return S(d(window,"compositionstart").pipe(m(()=>!0)),d(window,"compositionend").pipe(m(()=>!1))).pipe(Q(!1))}function on(){let 
e=d(window,"keydown").pipe(b(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:rn("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),b(({mode:t,type:r})=>{if(t==="global"){let o=Re();if(typeof o!="undefined")return!wa(o,r)}return!0}),pe());return Ta().pipe(v(t=>t?O:e))}function xe(){return new URL(location.href)}function pt(e,t=!1){if(B("navigation.instant")&&!t){let r=x("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function nn(){return new g}function an(){return location.hash.slice(1)}function sn(e){let t=x("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Sa(e){return S(d(window,"hashchange"),e).pipe(m(an),Q(an()),b(t=>t.length>0),G(1))}function cn(e){return Sa(e).pipe(m(t=>fe(`[id="${t}"]`)),b(t=>typeof t!="undefined"))}function $t(e){let t=matchMedia(e);return ar(r=>t.addListener(()=>r(t.matches))).pipe(Q(t.matches))}function pn(){let e=matchMedia("print");return S(d(window,"beforeprint").pipe(m(()=>!0)),d(window,"afterprint").pipe(m(()=>!1))).pipe(Q(e.matches))}function Nr(e,t){return e.pipe(v(r=>r?t():O))}function zr(e,t){return new F(r=>{let o=new XMLHttpRequest;return o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network error"))}),o.addEventListener("abort",()=>{r.complete()}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{var i;if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let a=(i=o.getResponseHeader("Content-Length"))!=null?i:0;t.progress$.next(n.loaded/+a*100)}}),t.progress$.next(5)),o.send(),()=>o.abort()})}function Ne(e,t){return zr(e,t).pipe(v(r=>r.text()),m(r=>JSON.parse(r)),G(1))}function ln(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/html")),G(1))}function 
mn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),G(1))}function fn(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function un(){return S(d(window,"scroll",{passive:!0}),d(window,"resize",{passive:!0})).pipe(m(fn),Q(fn()))}function dn(){return{width:innerWidth,height:innerHeight}}function hn(){return d(window,"resize",{passive:!0}).pipe(m(dn),Q(dn()))}function bn(){return z([un(),hn()]).pipe(m(([e,t])=>({offset:e,size:t})),G(1))}function mr(e,{viewport$:t,header$:r}){let o=t.pipe(Z("size")),n=z([o,r]).pipe(m(()=>Ue(e)));return z([r,t,n]).pipe(m(([{height:i},{offset:a,size:s},{x:p,y:c}])=>({offset:{x:a.x-p,y:a.y-c+i},size:s})))}function Oa(e){return d(e,"message",t=>t.data)}function Ma(e){let t=new g;return t.subscribe(r=>e.postMessage(r)),t}function vn(e,t=new Worker(e)){let r=Oa(t),o=Ma(t),n=new g;n.subscribe(o);let i=o.pipe(X(),ne(!0));return n.pipe(X(),Pe(r.pipe(U(i))),pe())}var La=P("#__config"),St=JSON.parse(La.textContent);St.base=`${new URL(St.base,xe())}`;function ye(){return St}function B(e){return St.features.includes(e)}function Ee(e,t){return typeof t!="undefined"?St.translations[e].replace("#",t.toString()):St.translations[e]}function Se(e,t=document){return P(`[data-md-component=${e}]`,t)}function ae(e,t=document){return $(`[data-md-component=${e}]`,t)}function _a(e){let t=P(".md-typeset > :first-child",e);return d(t,"click",{once:!0}).pipe(m(()=>P(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function gn(e){if(!B("announce.dismiss")||!e.childElementCount)return O;if(!e.hidden){let t=P(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return C(()=>{let t=new g;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),_a(e).pipe(E(r=>t.next(r)),L(()=>t.complete()),m(r=>R({ref:e},r)))})}function Aa(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function xn(e,t){let r=new g;return 
r.subscribe(({hidden:o})=>{e.hidden=o}),Aa(e,t).pipe(E(o=>r.next(o)),L(()=>r.complete()),m(o=>R({ref:e},o)))}function Pt(e,t){return t==="inline"?x("div",{class:"md-tooltip md-tooltip--inline",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"})):x("div",{class:"md-tooltip",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"}))}function yn(...e){return x("div",{class:"md-tooltip2",role:"tooltip"},x("div",{class:"md-tooltip2__inner md-typeset"},e))}function En(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return x("aside",{class:"md-annotation",tabIndex:0},Pt(t),x("a",{href:r,class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}else return x("aside",{class:"md-annotation",tabIndex:0},Pt(t),x("span",{class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}function wn(e){return x("button",{class:"md-clipboard md-icon",title:Ee("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}function qr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(p=>!e.terms[p]).reduce((p,c)=>[...p,x("del",null,c)," "],[]).slice(0,-1),i=ye(),a=new URL(e.location,i.base);B("search.highlight")&&a.searchParams.set("h",Object.entries(e.terms).filter(([,p])=>p).reduce((p,[c])=>`${p} ${c}`.trim(),""));let{tags:s}=ye();return x("a",{href:`${a}`,class:"md-search-result__link",tabIndex:-1},x("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&x("div",{class:"md-search-result__icon md-icon"}),r>0&&x("h1",null,e.title),r<=0&&x("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&e.tags.map(p=>{let c=s?p in s?`md-tag-icon md-tag--${s[p]}`:"md-tag-icon":"";return x("span",{class:`md-tag ${c}`},p)}),o>0&&n.length>0&&x("p",{class:"md-search-result__terms"},Ee("search.result.term.missing"),": ",...n)))}function Tn(e){let t=e[0].score,r=[...e],o=ye(),n=r.findIndex(l=>!`${new 
URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),a=r.findIndex(l=>l.score<t);a===-1&&(a=r.length);let s=r.slice(0,a),p=r.slice(a),c=[qr(i,2|+(!n&&a===0)),...s.map(l=>qr(l,1)),...p.length?[x("details",{class:"md-search-result__more"},x("summary",{tabIndex:-1},x("div",null,p.length>0&&p.length===1?Ee("search.result.more.one"):Ee("search.result.more.other",p.length))),...p.map(l=>qr(l,1)))]:[]];return x("li",{class:"md-search-result__item"},c)}function Sn(e){return x("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>x("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?sr(r):r)))}function Qr(e){let t=`tabbed-control tabbed-control--${e}`;return x("div",{class:t,hidden:!0},x("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function On(e){return x("div",{class:"md-typeset__scrollwrap"},x("div",{class:"md-typeset__table"},e))}function Ca(e){var o;let t=ye(),r=new URL(`../${e.version}/`,t.base);return x("li",{class:"md-version__item"},x("a",{href:`${r}`,class:"md-version__link"},e.title,((o=t.version)==null?void 0:o.alias)&&e.aliases.length>0&&x("span",{class:"md-version__alias"},e.aliases[0])))}function Mn(e,t){var o;let r=ye();return e=e.filter(n=>{var i;return!((i=n.properties)!=null&&i.hidden)}),x("div",{class:"md-version"},x("button",{class:"md-version__current","aria-label":Ee("select.version")},t.title,((o=r.version)==null?void 0:o.alias)&&t.aliases.length>0&&x("span",{class:"md-version__alias"},t.aliases[0])),x("ul",{class:"md-version__list"},e.map(Ca)))}var Ha=0;function ka(e){let t=z([et(e),kt(e)]).pipe(m(([o,n])=>o||n),K()),r=C(()=>Jo(e)).pipe(oe(De),ct(1),m(()=>Xo(e)));return t.pipe(Ae(o=>o),v(()=>z([t,r])),m(([o,n])=>({active:o,offset:n})),pe())}function $a(e,t){let{content$:r,viewport$:o}=t,n=`__tooltip2_${Ha++}`;return C(()=>{let i=new g,a=new _r(!1);i.pipe(X(),ne(!1)).subscribe(a);let 
s=a.pipe(Ht(c=>Me(+!c*250,Hr)),K(),v(c=>c?r:O),E(c=>c.id=n),pe());z([i.pipe(m(({active:c})=>c)),s.pipe(v(c=>kt(c,250)),Q(!1))]).pipe(m(c=>c.some(l=>l))).subscribe(a);let p=a.pipe(b(c=>c),ee(s,o),m(([c,l,{size:f}])=>{let u=e.getBoundingClientRect(),h=u.width/2;if(l.role==="tooltip")return{x:h,y:8+u.height};if(u.y>=f.height/2){let{height:w}=ce(l);return{x:h,y:-16-w}}else return{x:h,y:16+u.height}}));return z([s,i,p]).subscribe(([c,{offset:l},f])=>{c.style.setProperty("--md-tooltip-host-x",`${l.x}px`),c.style.setProperty("--md-tooltip-host-y",`${l.y}px`),c.style.setProperty("--md-tooltip-x",`${f.x}px`),c.style.setProperty("--md-tooltip-y",`${f.y}px`),c.classList.toggle("md-tooltip2--top",f.y<0),c.classList.toggle("md-tooltip2--bottom",f.y>=0)}),a.pipe(b(c=>c),ee(s,(c,l)=>l),b(c=>c.role==="tooltip")).subscribe(c=>{let l=ce(P(":scope > *",c));c.style.setProperty("--md-tooltip-width",`${l.width}px`),c.style.setProperty("--md-tooltip-tail","0px")}),a.pipe(K(),be(me),ee(s)).subscribe(([c,l])=>{l.classList.toggle("md-tooltip2--active",c)}),z([a.pipe(b(c=>c)),s]).subscribe(([c,l])=>{l.role==="dialog"?(e.setAttribute("aria-controls",n),e.setAttribute("aria-haspopup","dialog")):e.setAttribute("aria-describedby",n)}),a.pipe(b(c=>!c)).subscribe(()=>{e.removeAttribute("aria-controls"),e.removeAttribute("aria-describedby"),e.removeAttribute("aria-haspopup")}),ka(e).pipe(E(c=>i.next(c)),L(()=>i.complete()),m(c=>R({ref:e},c)))})}function lt(e,{viewport$:t},r=document.body){return $a(e,{content$:new F(o=>{let n=e.title,i=yn(n);return o.next(i),e.removeAttribute("title"),r.append(i),()=>{i.remove(),e.setAttribute("title",n)}}),viewport$:t})}function Pa(e,t){let r=C(()=>z([Zo(e),De(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:a,height:s}=ce(e);return{x:o-i.x+a/2,y:n-i.y+s/2}}));return et(e).pipe(v(o=>r.pipe(m(n=>({active:o,offset:n})),Te(+!o||1/0))))}function Ln(e,t,{target$:r}){let[o,n]=Array.from(e.children);return C(()=>{let i=new g,a=i.pipe(X(),ne(!0));return 
i.subscribe({next({offset:s}){e.style.setProperty("--md-tooltip-x",`${s.x}px`),e.style.setProperty("--md-tooltip-y",`${s.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),tt(e).pipe(U(a)).subscribe(s=>{e.toggleAttribute("data-md-visible",s)}),S(i.pipe(b(({active:s})=>s)),i.pipe(_e(250),b(({active:s})=>!s))).subscribe({next({active:s}){s?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Le(16,me)).subscribe(({active:s})=>{o.classList.toggle("md-tooltip--active",s)}),i.pipe(ct(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:s})=>s)).subscribe({next(s){s?e.style.setProperty("--md-tooltip-0",`${-s}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),d(n,"click").pipe(U(a),b(s=>!(s.metaKey||s.ctrlKey))).subscribe(s=>{s.stopPropagation(),s.preventDefault()}),d(n,"mousedown").pipe(U(a),ee(i)).subscribe(([s,{active:p}])=>{var c;if(s.button!==0||s.metaKey||s.ctrlKey)s.preventDefault();else if(p){s.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(c=Re())==null||c.blur()}}),r.pipe(U(a),b(s=>s===o),Ge(125)).subscribe(()=>e.focus()),Pa(e,t).pipe(E(s=>i.next(s)),L(()=>i.complete()),m(s=>R({ref:e},s)))})}function Ra(e){return e.tagName==="CODE"?$(".c, .c1, .cm",e):[e]}function Ia(e){let t=[];for(let r of Ra(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of o){let a;for(;a=/(\(\d+\))(!)?/.exec(i.textContent);){let[,s,p]=a;if(typeof p=="undefined"){let c=i.splitText(a.index);i=c.splitText(s.length),t.push(c)}else{i.textContent=s,t.push(i);break}}}}return t}function _n(e,t){t.append(...Array.from(e.childNodes))}function fr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,a=new Map;for(let s of Ia(t)){let[,p]=s.textContent.match(/\((\d+)\)/);fe(`:scope > 
li:nth-child(${p})`,e)&&(a.set(p,En(p,i)),s.replaceWith(a.get(p)))}return a.size===0?O:C(()=>{let s=new g,p=s.pipe(X(),ne(!0)),c=[];for(let[l,f]of a)c.push([P(".md-typeset",f),P(`:scope > li:nth-child(${l})`,e)]);return o.pipe(U(p)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of c)l?_n(f,u):_n(u,f)}),S(...[...a].map(([,l])=>Ln(l,t,{target$:r}))).pipe(L(()=>s.complete()),pe())})}function An(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return An(t)}}function Cn(e,t){return C(()=>{let r=An(e);return typeof r!="undefined"?fr(r,e,t):O})}var Hn=Vt(Yr());var Fa=0;function kn(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return kn(t)}}function ja(e){return ge(e).pipe(m(({width:t})=>({scrollable:Tt(e).width>t})),Z("scrollable"))}function $n(e,t){let{matches:r}=matchMedia("(hover)"),o=C(()=>{let n=new g,i=n.pipe(Fr(1));n.subscribe(({scrollable:c})=>{c&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")});let a=[];if(Hn.default.isSupported()&&(e.closest(".copy")||B("content.code.copy")&&!e.closest(".no-copy"))){let c=e.closest("pre");c.id=`__code_${Fa++}`;let l=wn(c.id);c.insertBefore(l,e),B("content.tooltips")&&a.push(lt(l,{viewport$}))}let s=e.closest(".highlight");if(s instanceof HTMLElement){let c=kn(s);if(typeof c!="undefined"&&(s.classList.contains("annotate")||B("content.code.annotate"))){let l=fr(c,e,t);a.push(ge(s).pipe(U(i),m(({width:f,height:u})=>f&&u),K(),v(f=>f?l:O)))}}return $(":scope > span[id]",e).length&&e.classList.add("md-code__content"),ja(e).pipe(E(c=>n.next(c)),L(()=>n.complete()),m(c=>R({ref:e},c)),Pe(...a))});return B("content.lazy")?tt(e).pipe(b(n=>n),Te(1),v(()=>o)):o}function Wa(e,{target$:t,print$:r}){let o=!0;return 
S(t.pipe(m(n=>n.closest("details:not([open])")),b(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(b(n=>n||!o),E(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function Pn(e,t){return C(()=>{let r=new g;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),Wa(e,t).pipe(E(o=>r.next(o)),L(()=>r.complete()),m(o=>R({ref:e},o)))})}var Rn=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel rect,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel rect{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node 
.divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a .nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs 
#statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START circle{fill:var(--md-mermaid-label-bg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead 
path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var Br,Da=0;function Va(){return typeof mermaid=="undefined"||mermaid instanceof Element?wt("https://unpkg.com/mermaid@10/dist/mermaid.min.js"):I(void 0)}function In(e){return e.classList.remove("mermaid"),Br||(Br=Va().pipe(E(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Rn,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),G(1))),Br.subscribe(()=>ao(this,null,function*(){e.classList.add("mermaid");let t=`__mermaid_${Da++}`,r=x("div",{class:"mermaid"}),o=e.textContent,{svg:n,fn:i}=yield mermaid.render(t,o),a=r.attachShadow({mode:"closed"});a.innerHTML=n,e.replaceWith(r),i==null||i(a)})),Br.pipe(m(()=>({ref:e})))}var Fn=x("table");function jn(e){return e.replaceWith(Fn),Fn.replaceWith(On(e)),I({ref:e})}function Na(e){let t=e.find(r=>r.checked)||e[0];return S(...e.map(r=>d(r,"change").pipe(m(()=>P(`label[for="${r.id}"]`))))).pipe(Q(P(`label[for="${t.id}"]`)),m(r=>({active:r})))}function Wn(e,{viewport$:t,target$:r}){let o=P(".tabbed-labels",e),n=$(":scope > input",e),i=Qr("prev");e.append(i);let a=Qr("next");return e.append(a),C(()=>{let s=new g,p=s.pipe(X(),ne(!0));z([s,ge(e),tt(e)]).pipe(U(p),Le(1,me)).subscribe({next([{active:c},l]){let f=Ue(c),{width:u}=ce(c);e.style.setProperty("--md-indicator-x",`${f.x}px`),e.style.setProperty("--md-indicator-width",`${u}px`);let 
h=pr(o);(f.x<h.x||f.x+u>h.x+l.width)&&o.scrollTo({left:Math.max(0,f.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),z([De(o),ge(o)]).pipe(U(p)).subscribe(([c,l])=>{let f=Tt(o);i.hidden=c.x<16,a.hidden=c.x>f.width-l.width-16}),S(d(i,"click").pipe(m(()=>-1)),d(a,"click").pipe(m(()=>1))).pipe(U(p)).subscribe(c=>{let{width:l}=ce(o);o.scrollBy({left:l*c,behavior:"smooth"})}),r.pipe(U(p),b(c=>n.includes(c))).subscribe(c=>c.click()),o.classList.add("tabbed-labels--linked");for(let c of n){let l=P(`label[for="${c.id}"]`);l.replaceChildren(x("a",{href:`#${l.htmlFor}`,tabIndex:-1},...Array.from(l.childNodes))),d(l.firstElementChild,"click").pipe(U(p),b(f=>!(f.metaKey||f.ctrlKey)),E(f=>{f.preventDefault(),f.stopPropagation()})).subscribe(()=>{history.replaceState({},"",`#${l.htmlFor}`),l.click()})}return B("content.tabs.link")&&s.pipe(Ce(1),ee(t)).subscribe(([{active:c},{offset:l}])=>{let f=c.innerText.trim();if(c.hasAttribute("data-md-switching"))c.removeAttribute("data-md-switching");else{let u=e.offsetTop-l.y;for(let w of $("[data-tabs]"))for(let A of $(":scope > input",w)){let te=P(`label[for="${A.id}"]`);if(te!==c&&te.innerText.trim()===f){te.setAttribute("data-md-switching",""),A.click();break}}window.scrollTo({top:e.offsetTop-u});let h=__md_get("__tabs")||[];__md_set("__tabs",[...new Set([f,...h])])}}),s.pipe(U(p)).subscribe(()=>{for(let c of $("audio, video",e))c.pause()}),Na(n).pipe(E(c=>s.next(c)),L(()=>s.complete()),m(c=>R({ref:e},c)))}).pipe(Qe(se))}function Un(e,{viewport$:t,target$:r,print$:o}){return S(...$(".annotate:not(.highlight)",e).map(n=>Cn(n,{target$:r,print$:o})),...$("pre:not(.mermaid) > 
code",e).map(n=>$n(n,{target$:r,print$:o})),...$("pre.mermaid",e).map(n=>In(n)),...$("table:not([class])",e).map(n=>jn(n)),...$("details",e).map(n=>Pn(n,{target$:r,print$:o})),...$("[data-tabs]",e).map(n=>Wn(n,{viewport$:t,target$:r})),...$("[title]",e).filter(()=>B("content.tooltips")).map(n=>lt(n,{viewport$:t})))}function za(e,{alert$:t}){return t.pipe(v(r=>S(I(!0),I(!1).pipe(Ge(2e3))).pipe(m(o=>({message:r,active:o})))))}function Dn(e,t){let r=P(".md-typeset",e);return C(()=>{let o=new g;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),za(e,t).pipe(E(n=>o.next(n)),L(()=>o.complete()),m(n=>R({ref:e},n)))})}var qa=0;function Qa(e,t){document.body.append(e);let{width:r}=ce(e);e.style.setProperty("--md-tooltip-width",`${r}px`),e.remove();let o=cr(t),n=typeof o!="undefined"?De(o):I({x:0,y:0}),i=S(et(t),kt(t)).pipe(K());return z([i,n]).pipe(m(([a,s])=>{let{x:p,y:c}=Ue(t),l=ce(t),f=t.closest("table");return f&&t.parentElement&&(p+=f.offsetLeft+t.parentElement.offsetLeft,c+=f.offsetTop+t.parentElement.offsetTop),{active:a,offset:{x:p-s.x+l.width/2-r/2,y:c-s.y+l.height+8}}}))}function Vn(e){let t=e.title;if(!t.length)return O;let r=`__tooltip_${qa++}`,o=Pt(r,"inline"),n=P(".md-typeset",o);return n.innerHTML=t,C(()=>{let i=new g;return 
i.subscribe({next({offset:a}){o.style.setProperty("--md-tooltip-x",`${a.x}px`),o.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){o.style.removeProperty("--md-tooltip-x"),o.style.removeProperty("--md-tooltip-y")}}),S(i.pipe(b(({active:a})=>a)),i.pipe(_e(250),b(({active:a})=>!a))).subscribe({next({active:a}){a?(e.insertAdjacentElement("afterend",o),e.setAttribute("aria-describedby",r),e.removeAttribute("title")):(o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t))},complete(){o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t)}}),i.pipe(Le(16,me)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(ct(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?o.style.setProperty("--md-tooltip-0",`${-a}px`):o.style.removeProperty("--md-tooltip-0")},complete(){o.style.removeProperty("--md-tooltip-0")}}),Qa(o,e).pipe(E(a=>i.next(a)),L(()=>i.complete()),m(a=>R({ref:e},a)))}).pipe(Qe(se))}function Ka({viewport$:e}){if(!B("header.autohide"))return I(!1);let t=e.pipe(m(({offset:{y:n}})=>n),Ye(2,1),m(([n,i])=>[n<i,i]),Z(0)),r=z([e,t]).pipe(b(([{offset:n},[,i]])=>Math.abs(i-n.y)>100),m(([,[n]])=>n),K()),o=Ve("search");return z([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),K(),v(n=>n?r:I(!1)),Q(!1))}function Nn(e,t){return C(()=>z([ge(e),Ka(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),K((r,o)=>r.height===o.height&&r.hidden===o.hidden),G(1))}function zn(e,{header$:t,main$:r}){return C(()=>{let o=new g,n=o.pipe(X(),ne(!0));o.pipe(Z("active"),We(t)).subscribe(([{active:a},{hidden:s}])=>{e.classList.toggle("md-header--shadow",a&&!s),e.hidden=s});let i=ue($("[title]",e)).pipe(b(()=>B("content.tooltips")),oe(a=>Vn(a)));return r.subscribe(o),t.pipe(U(n),m(a=>R({ref:e},a)),Pe(i.pipe(U(n))))})}function Ya(e,{viewport$:t,header$:r}){return 
mr(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=ce(e);return{active:o>=n}}),Z("active"))}function qn(e,t){return C(()=>{let r=new g;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=fe(".md-content h1");return typeof o=="undefined"?O:Ya(o,t).pipe(E(n=>r.next(n)),L(()=>r.complete()),m(n=>R({ref:e},n)))})}function Qn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),K()),n=o.pipe(v(()=>ge(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),Z("bottom"))));return z([o,n,t]).pipe(m(([i,{top:a,bottom:s},{offset:{y:p},size:{height:c}}])=>(c=Math.max(0,c-Math.max(0,a-p,i)-Math.max(0,c+p-s)),{offset:a-i,height:c,active:a-i<=p})),K((i,a)=>i.offset===a.offset&&i.height===a.height&&i.active===a.active))}function Ba(e){let t=__md_get("__palette")||{index:e.findIndex(o=>matchMedia(o.getAttribute("data-md-color-media")).matches)},r=Math.max(0,Math.min(t.index,e.length-1));return I(...e).pipe(oe(o=>d(o,"change").pipe(m(()=>o))),Q(e[r]),m(o=>({index:e.indexOf(o),color:{media:o.getAttribute("data-md-color-media"),scheme:o.getAttribute("data-md-color-scheme"),primary:o.getAttribute("data-md-color-primary"),accent:o.getAttribute("data-md-color-accent")}})),G(1))}function Kn(e){let t=$("input",e),r=x("meta",{name:"theme-color"});document.head.appendChild(r);let o=x("meta",{name:"color-scheme"});document.head.appendChild(o);let n=$t("(prefers-color-scheme: light)");return C(()=>{let i=new g;return i.subscribe(a=>{if(document.body.setAttribute("data-md-color-switching",""),a.color.media==="(prefers-color-scheme)"){let s=matchMedia("(prefers-color-scheme: light)"),p=document.querySelector(s.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: 
dark)']");a.color.scheme=p.getAttribute("data-md-color-scheme"),a.color.primary=p.getAttribute("data-md-color-primary"),a.color.accent=p.getAttribute("data-md-color-accent")}for(let[s,p]of Object.entries(a.color))document.body.setAttribute(`data-md-color-${s}`,p);for(let s=0;s<t.length;s++){let p=t[s].nextElementSibling;p instanceof HTMLElement&&(p.hidden=a.index!==s)}__md_set("__palette",a)}),d(e,"keydown").pipe(b(a=>a.key==="Enter"),ee(i,(a,s)=>s)).subscribe(({index:a})=>{a=(a+1)%t.length,t[a].click(),t[a].focus()}),i.pipe(m(()=>{let a=Se("header"),s=window.getComputedStyle(a);return o.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(p=>(+p).toString(16).padStart(2,"0")).join("")})).subscribe(a=>r.content=`#${a}`),i.pipe(be(se)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")}),Ba(t).pipe(U(n.pipe(Ce(1))),st(),E(a=>i.next(a)),L(()=>i.complete()),m(a=>R({ref:e},a)))})}function Yn(e,{progress$:t}){return C(()=>{let r=new g;return r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(E(o=>r.next({value:o})),L(()=>r.complete()),m(o=>({ref:e,value:o})))})}var Gr=Vt(Yr());function Ga(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Bn({alert$:e}){Gr.default.isSupported()&&new F(t=>{new Gr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||Ga(P(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(E(t=>{t.trigger.focus()}),m(()=>Ee("clipboard.copied"))).subscribe(e)}function Gn(e,t){return e.protocol=t.protocol,e.hostname=t.hostname,e}function Ja(e,t){let r=new Map;for(let o of $("url",e)){let n=P("loc",o),i=[Gn(new URL(n.textContent),t)];r.set(`${i[0]}`,i);for(let a of $("[rel=alternate]",o)){let s=a.getAttribute("href");s!=null&&i.push(Gn(new URL(s),t))}}return r}function ur(e){return mn(new 
URL("sitemap.xml",e)).pipe(m(t=>Ja(t,new URL(e))),ve(()=>I(new Map)))}function Xa(e,t){if(!(e.target instanceof Element))return O;let r=e.target.closest("a");if(r===null)return O;if(r.target||e.metaKey||e.ctrlKey)return O;let o=new URL(r.href);return o.search=o.hash="",t.has(`${o}`)?(e.preventDefault(),I(new URL(r.href))):O}function Jn(e){let t=new Map;for(let r of $(":scope > *",e.head))t.set(r.outerHTML,r);return t}function Xn(e){for(let t of $("[href], [src]",e))for(let r of["href","src"]){let o=t.getAttribute(r);if(o&&!/^(?:[a-z]+:)?\/\//i.test(o)){t[r]=t[r];break}}return I(e)}function Za(e){for(let o of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...B("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let n=fe(o),i=fe(o,e);typeof n!="undefined"&&typeof i!="undefined"&&n.replaceWith(i)}let t=Jn(document);for(let[o,n]of Jn(e))t.has(o)?t.delete(o):document.head.appendChild(n);for(let o of t.values()){let n=o.getAttribute("name");n!=="theme-color"&&n!=="color-scheme"&&o.remove()}let r=Se("container");return je($("script",r)).pipe(v(o=>{let n=e.createElement("script");if(o.src){for(let i of o.getAttributeNames())n.setAttribute(i,o.getAttribute(i));return o.replaceWith(n),new F(i=>{n.onload=()=>i.complete()})}else return n.textContent=o.textContent,o.replaceWith(n),O}),X(),ne(document))}function Zn({location$:e,viewport$:t,progress$:r}){let o=ye();if(location.protocol==="file:")return O;let n=ur(o.base);I(document).subscribe(Xn);let i=d(document.body,"click").pipe(We(n),v(([p,c])=>Xa(p,c)),pe()),a=d(window,"popstate").pipe(m(xe),pe());i.pipe(ee(t)).subscribe(([p,{offset:c}])=>{history.replaceState(c,""),history.pushState(null,"",p)}),S(i,a).subscribe(e);let s=e.pipe(Z("pathname"),v(p=>ln(p,{progress$:r}).pipe(ve(()=>(pt(p,!0),O)))),v(Xn),v(Za),pe());return 
S(s.pipe(ee(e,(p,c)=>c)),s.pipe(v(()=>e),Z("pathname"),v(()=>e),Z("hash")),e.pipe(K((p,c)=>p.pathname===c.pathname&&p.hash===c.hash),v(()=>i),E(()=>history.back()))).subscribe(p=>{var c,l;history.state!==null||!p.hash?window.scrollTo(0,(l=(c=history.state)==null?void 0:c.y)!=null?l:0):(history.scrollRestoration="auto",sn(p.hash),history.scrollRestoration="manual")}),e.subscribe(()=>{history.scrollRestoration="manual"}),d(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),t.pipe(Z("offset"),_e(100)).subscribe(({offset:p})=>{history.replaceState(p,"")}),s}var ri=Vt(ti());function oi(e){let t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,a)=>`${i}<mark data-md-highlight>${a}</mark>`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return a=>(0,ri.default)(a).replace(i,o).replace(/<\/mark>(\s+)<mark[^>]*>/img,"$1")}}function It(e){return e.type===1}function dr(e){return e.type===3}function ni(e,t){let r=vn(e);return S(I(location.protocol!=="file:"),Ve("search")).pipe(Ae(o=>o),v(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:B("search.suggest")}}})),r}function ii({document$:e}){let t=ye(),r=Ne(new URL("../versions.json",t.base)).pipe(ve(()=>O)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:a,aliases:s})=>a===i||s.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),v(n=>d(document.body,"click").pipe(b(i=>!i.metaKey&&!i.ctrlKey),ee(o),v(([i,a])=>{if(i.target instanceof Element){let s=i.target.closest("a");if(s&&!s.target&&n.has(s.href)){let p=s.href;return!i.target.closest(".md-version")&&n.get(p)===a?O:(i.preventDefault(),I(p))}}return O}),v(i=>ur(new URL(i)).pipe(m(a=>{let p=xe().href.replace(t.base,i);return a.has(p.split("#")[0])?new 
URL(p):new URL(i)})))))).subscribe(n=>pt(n,!0)),z([r,o]).subscribe(([n,i])=>{P(".md-header__topic").appendChild(Mn(n,i))}),e.pipe(v(()=>o)).subscribe(n=>{var a;let i=__md_get("__outdated",sessionStorage);if(i===null){i=!0;let s=((a=t.version)==null?void 0:a.default)||"latest";Array.isArray(s)||(s=[s]);e:for(let p of s)for(let c of n.aliases.concat(n.version))if(new RegExp(p,"i").test(c)){i=!1;break e}__md_set("__outdated",i,sessionStorage)}if(i)for(let s of ae("outdated"))s.hidden=!1})}function ns(e,{worker$:t}){let{searchParams:r}=xe();r.has("q")&&(Je("search",!0),e.value=r.get("q"),e.focus(),Ve("search").pipe(Ae(i=>!i)).subscribe(()=>{let i=xe();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=et(e),n=S(t.pipe(Ae(It)),d(e,"keyup"),o).pipe(m(()=>e.value),K());return z([n,o]).pipe(m(([i,a])=>({value:i,focus:a})),G(1))}function ai(e,{worker$:t}){let r=new g,o=r.pipe(X(),ne(!0));z([t.pipe(Ae(It)),r],(i,a)=>a).pipe(Z("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(Z("focus")).subscribe(({focus:i})=>{i&&Je("search",i)}),d(e.form,"reset").pipe(U(o)).subscribe(()=>e.focus());let n=P("header [for=__search]");return d(n,"click").subscribe(()=>e.focus()),ns(e,{worker$:t}).pipe(E(i=>r.next(i)),L(()=>r.complete()),m(i=>R({ref:e},i)),G(1))}function si(e,{worker$:t,query$:r}){let o=new g,n=tn(e.parentElement).pipe(b(Boolean)),i=e.parentElement,a=P(":scope > :first-child",e),s=P(":scope > :last-child",e);Ve("search").subscribe(l=>s.setAttribute("role",l?"list":"presentation")),o.pipe(ee(r),Ur(t.pipe(Ae(It)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:a.textContent=f.length?Ee("search.result.none"):Ee("search.result.placeholder");break;case 1:a.textContent=Ee("search.result.one");break;default:let u=sr(l.length);a.textContent=Ee("search.result.other",u)}});let p=o.pipe(E(()=>s.innerHTML=""),v(({items:l})=>S(I(...l.slice(0,10)),I(...l.slice(10)).pipe(Ye(4),Vr(n),v(([f])=>f)))),m(Tn),pe());return 
p.subscribe(l=>s.appendChild(l)),p.pipe(oe(l=>{let f=fe("details",l);return typeof f=="undefined"?O:d(f,"toggle").pipe(U(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(b(dr),m(({data:l})=>l)).pipe(E(l=>o.next(l)),L(()=>o.complete()),m(l=>R({ref:e},l)))}function is(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=xe();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function ci(e,t){let r=new g,o=r.pipe(X(),ne(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),d(e,"click").pipe(U(o)).subscribe(n=>n.preventDefault()),is(e,t).pipe(E(n=>r.next(n)),L(()=>r.complete()),m(n=>R({ref:e},n)))}function pi(e,{worker$:t,keyboard$:r}){let o=new g,n=Se("search-query"),i=S(d(n,"keydown"),d(n,"focus")).pipe(be(se),m(()=>n.value),K());return o.pipe(We(i),m(([{suggest:s},p])=>{let c=p.split(/([\s-]+)/);if(s!=null&&s.length&&c[c.length-1]){let l=s[s.length-1];l.startsWith(c[c.length-1])&&(c[c.length-1]=l)}else c.length=0;return c})).subscribe(s=>e.innerHTML=s.join("").replace(/\s/g,"&nbsp;")),r.pipe(b(({mode:s})=>s==="search")).subscribe(s=>{switch(s.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(b(dr),m(({data:s})=>s)).pipe(E(s=>o.next(s)),L(()=>o.complete()),m(()=>({ref:e})))}function li(e,{index$:t,keyboard$:r}){let o=ye();try{let n=ni(o.search,t),i=Se("search-query",e),a=Se("search-result",e);d(e,"click").pipe(b(({target:p})=>p instanceof Element&&!!p.closest("a"))).subscribe(()=>Je("search",!1)),r.pipe(b(({mode:p})=>p==="search")).subscribe(p=>{let c=Re();switch(p.type){case"Enter":if(c===i){let l=new Map;for(let f of $(":first-child [href]",a)){let 
u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,h])=>h-u);f.click()}p.claim()}break;case"Escape":case"Tab":Je("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof c=="undefined")i.focus();else{let l=[i,...$(":not(details) > [href], summary, details[open] [href]",a)],f=Math.max(0,(Math.max(0,l.indexOf(c))+l.length+(p.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}p.claim();break;default:i!==Re()&&i.focus()}}),r.pipe(b(({mode:p})=>p==="global")).subscribe(p=>{switch(p.type){case"f":case"s":case"/":i.focus(),i.select(),p.claim();break}});let s=ai(i,{worker$:n});return S(s,si(a,{worker$:n,query$:s})).pipe(Pe(...ae("search-share",e).map(p=>ci(p,{query$:s})),...ae("search-suggest",e).map(p=>pi(p,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ke}}function mi(e,{index$:t,location$:r}){return z([t,r.pipe(Q(xe()),b(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>oi(o.config)(n.searchParams.get("h"))),m(o=>{var a;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let s=i.nextNode();s;s=i.nextNode())if((a=s.parentElement)!=null&&a.offsetHeight){let p=s.textContent,c=o(p);c.length>p.length&&n.set(s,c)}for(let[s,p]of n){let{childNodes:c}=x("span",null,p);s.replaceWith(...Array.from(c))}return{ref:e,nodes:n}}))}function as(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return z([r,t]).pipe(m(([{offset:i,height:a},{offset:{y:s}}])=>(a=a+Math.min(n,Math.max(0,s-i))-n,{height:a,locked:s>=i+n})),K((i,a)=>i.height===a.height&&i.locked===a.locked))}function Jr(e,o){var n=o,{header$:t}=n,r=io(n,["header$"]);let i=P(".md-sidebar__scrollwrap",e),{y:a}=Ue(i);return C(()=>{let s=new g,p=s.pipe(X(),ne(!0)),c=s.pipe(Le(0,me));return c.pipe(ee(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*a}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),c.pipe(Ae()).subscribe(()=>{for(let l of 
$(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:h}=ce(f);f.scrollTo({top:u-h/2})}}}),ue($("label[tabindex]",e)).pipe(oe(l=>d(l,"click").pipe(be(se),m(()=>l),U(p)))).subscribe(l=>{let f=P(`[id="${l.htmlFor}"]`);P(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),as(e,r).pipe(E(l=>s.next(l)),L(()=>s.complete()),m(l=>R({ref:e},l)))})}function fi(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return Ct(Ne(`${r}/releases/latest`).pipe(ve(()=>O),m(o=>({version:o.tag_name})),Be({})),Ne(r).pipe(ve(()=>O),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),Be({}))).pipe(m(([o,n])=>R(R({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return Ne(r).pipe(m(o=>({repositories:o.public_repos})),Be({}))}}function ui(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return Ne(r).pipe(ve(()=>O),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),Be({}))}function di(e){let t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return fi(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return ui(r,o)}return O}var ss;function cs(e){return ss||(ss=C(()=>{let t=__md_get("__source",sessionStorage);if(t)return I(t);if(ae("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return O}return di(e.href).pipe(E(o=>__md_set("__source",o,sessionStorage)))}).pipe(ve(()=>O),b(t=>Object.keys(t).length>0),m(t=>({facts:t})),G(1)))}function hi(e){let t=P(":scope > :last-child",e);return C(()=>{let r=new g;return r.subscribe(({facts:o})=>{t.appendChild(Sn(o)),t.classList.add("md-source__repository--active")}),cs(e).pipe(E(o=>r.next(o)),L(()=>r.complete()),m(o=>R({ref:e},o)))})}function ps(e,{viewport$:t,header$:r}){return ge(document.body).pipe(v(()=>mr(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),Z("hidden"))}function 
bi(e,t){return C(()=>{let r=new g;return r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(B("navigation.tabs.sticky")?I({hidden:!1}):ps(e,t)).pipe(E(o=>r.next(o)),L(()=>r.complete()),m(o=>R({ref:e},o)))})}function ls(e,{viewport$:t,header$:r}){let o=new Map,n=$(".md-nav__link",e);for(let s of n){let p=decodeURIComponent(s.hash.substring(1)),c=fe(`[id="${p}"]`);typeof c!="undefined"&&o.set(s,c)}let i=r.pipe(Z("height"),m(({height:s})=>{let p=Se("main"),c=P(":scope > :first-child",p);return s+.8*(c.offsetTop-p.offsetTop)}),pe());return ge(document.body).pipe(Z("height"),v(s=>C(()=>{let p=[];return I([...o].reduce((c,[l,f])=>{for(;p.length&&o.get(p[p.length-1]).tagName>=f.tagName;)p.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let h=f.offsetParent;for(;h;h=h.offsetParent)u+=h.offsetTop;return c.set([...p=[...p,l]].reverse(),u)},new Map))}).pipe(m(p=>new Map([...p].sort(([,c],[,l])=>c-l))),We(i),v(([p,c])=>t.pipe(jr(([l,f],{offset:{y:u},size:h})=>{let w=u+h.height>=Math.floor(s.height);for(;f.length;){let[,A]=f[0];if(A-c<u||w)l=[...l,f.shift()];else break}for(;l.length;){let[,A]=l[l.length-1];if(A-c>=u&&!w)f=[l.pop(),...f];else break}return[l,f]},[[],[...p]]),K((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([s,p])=>({prev:s.map(([c])=>c),next:p.map(([c])=>c)})),Q({prev:[],next:[]}),Ye(2,1),m(([s,p])=>s.prev.length<p.prev.length?{prev:p.prev.slice(Math.max(0,s.prev.length-1),p.prev.length),next:[]}:{prev:p.prev.slice(-1),next:p.next.slice(0,p.next.length-s.next.length)}))}function vi(e,{viewport$:t,header$:r,main$:o,target$:n}){return C(()=>{let i=new g,a=i.pipe(X(),ne(!0));if(i.subscribe(({prev:s,next:p})=>{for(let[c]of p)c.classList.remove("md-nav__link--passed"),c.classList.remove("md-nav__link--active");for(let[c,[l]]of s.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",c===s.length-1)}),B("toc.follow")){let 
s=S(t.pipe(_e(1),m(()=>{})),t.pipe(_e(250),m(()=>"smooth")));i.pipe(b(({prev:p})=>p.length>0),We(o.pipe(be(se))),ee(s)).subscribe(([[{prev:p}],c])=>{let[l]=p[p.length-1];if(l.offsetHeight){let f=cr(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:h}=ce(f);f.scrollTo({top:u-h/2,behavior:c})}}})}return B("navigation.tracking")&&t.pipe(U(a),Z("offset"),_e(250),Ce(1),U(n.pipe(Ce(1))),st({delay:250}),ee(i)).subscribe(([,{prev:s}])=>{let p=xe(),c=s[s.length-1];if(c&&c.length){let[l]=c,{hash:f}=new URL(l.href);p.hash!==f&&(p.hash=f,history.replaceState({},"",`${p}`))}else p.hash="",history.replaceState({},"",`${p}`)}),ls(e,{viewport$:t,header$:r}).pipe(E(s=>i.next(s)),L(()=>i.complete()),m(s=>R({ref:e},s)))})}function ms(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:a}})=>a),Ye(2,1),m(([a,s])=>a>s&&s>0),K()),i=r.pipe(m(({active:a})=>a));return z([i,n]).pipe(m(([a,s])=>!(a&&s)),K(),U(o.pipe(Ce(1))),ne(!0),st({delay:250}),m(a=>({hidden:a})))}function gi(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new g,a=i.pipe(X(),ne(!0));return i.subscribe({next({hidden:s}){e.hidden=s,s?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(U(a),Z("height")).subscribe(({height:s})=>{e.style.top=`${s+16}px`}),d(e,"click").subscribe(s=>{s.preventDefault(),window.scrollTo({top:0})}),ms(e,{viewport$:t,main$:o,target$:n}).pipe(E(s=>i.next(s)),L(()=>i.complete()),m(s=>R({ref:e},s)))}function xi({document$:e,viewport$:t}){e.pipe(v(()=>$(".md-ellipsis")),oe(r=>tt(r).pipe(U(e.pipe(Ce(1))),b(o=>o),m(()=>r),Te(1))),b(r=>r.offsetWidth<r.scrollWidth),oe(r=>{let o=r.innerText,n=r.closest("a")||r;return n.title=o,B("content.tooltips")?lt(n,{viewport$:t}).pipe(U(e.pipe(Ce(1))),L(()=>n.removeAttribute("title"))):O})).subscribe(),B("content.tooltips")&&e.pipe(v(()=>$(".md-status")),oe(r=>lt(r,{viewport$:t}))).subscribe()}function 
yi({document$:e,tablet$:t}){e.pipe(v(()=>$(".md-toggle--indeterminate")),E(r=>{r.indeterminate=!0,r.checked=!1}),oe(r=>d(r,"change").pipe(Dr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),ee(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function fs(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function Ei({document$:e}){e.pipe(v(()=>$("[data-md-scrollfix]")),E(t=>t.removeAttribute("data-md-scrollfix")),b(fs),oe(t=>d(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function wi({viewport$:e,tablet$:t}){z([Ve("search"),t]).pipe(m(([r,o])=>r&&!o),v(r=>I(r).pipe(Ge(r?400:100))),ee(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function us(){return location.protocol==="file:"?wt(`${new URL("search/search_index.js",Xr.base)}`).pipe(m(()=>__index),G(1)):Ne(new 
URL("search/search_index.json",Xr.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var ot=Yo(),jt=nn(),Ot=cn(jt),Zr=on(),Oe=bn(),hr=$t("(min-width: 960px)"),Si=$t("(min-width: 1220px)"),Oi=pn(),Xr=ye(),Mi=document.forms.namedItem("search")?us():Ke,eo=new g;Bn({alert$:eo});var to=new g;B("navigation.instant")&&Zn({location$:jt,viewport$:Oe,progress$:to}).subscribe(ot);var Ti;((Ti=Xr.version)==null?void 0:Ti.provider)==="mike"&&ii({document$:ot});S(jt,Ot).pipe(Ge(125)).subscribe(()=>{Je("drawer",!1),Je("search",!1)});Zr.pipe(b(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let t=fe("link[rel=prev]");typeof t!="undefined"&&pt(t);break;case"n":case".":let r=fe("link[rel=next]");typeof r!="undefined"&&pt(r);break;case"Enter":let o=Re();o instanceof HTMLLabelElement&&o.click()}});xi({viewport$:Oe,document$:ot});yi({document$:ot,tablet$:hr});Ei({document$:ot});wi({viewport$:Oe,tablet$:hr});var rt=Nn(Se("header"),{viewport$:Oe}),Ft=ot.pipe(m(()=>Se("main")),v(e=>Qn(e,{viewport$:Oe,header$:rt})),G(1)),ds=S(...ae("consent").map(e=>xn(e,{target$:Ot})),...ae("dialog").map(e=>Dn(e,{alert$:eo})),...ae("header").map(e=>zn(e,{viewport$:Oe,header$:rt,main$:Ft})),...ae("palette").map(e=>Kn(e)),...ae("progress").map(e=>Yn(e,{progress$:to})),...ae("search").map(e=>li(e,{index$:Mi,keyboard$:Zr})),...ae("source").map(e=>hi(e))),hs=C(()=>S(...ae("announce").map(e=>gn(e)),...ae("content").map(e=>Un(e,{viewport$:Oe,target$:Ot,print$:Oi})),...ae("content").map(e=>B("search.highlight")?mi(e,{index$:Mi,location$:jt}):O),...ae("header-title").map(e=>qn(e,{viewport$:Oe,header$:rt})),...ae("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?Nr(Si,()=>Jr(e,{viewport$:Oe,header$:rt,main$:Ft})):Nr(hr,()=>Jr(e,{viewport$:Oe,header$:rt,main$:Ft}))),...ae("tabs").map(e=>bi(e,{viewport$:Oe,header$:rt})),...ae("toc").map(e=>vi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Ot})),...ae("top").map(e=>gi(e,{viewport$:Oe,
header$:rt,main$:Ft,target$:Ot})))),Li=ot.pipe(v(()=>hs),Pe(ds),G(1));Li.subscribe();window.document$=ot;window.location$=jt;window.target$=Ot;window.keyboard$=Zr;window.viewport$=Oe;window.tablet$=hr;window.screen$=Si;window.print$=Oi;window.alert$=eo;window.progress$=to;window.component$=Li;})();
+//# sourceMappingURL=bundle.fe8b6f2b.min.js.map
+
diff --git a/assets/javascripts/bundle.fe8b6f2b.min.js.map b/assets/javascripts/bundle.fe8b6f2b.min.js.map
new file mode 100644
index 0000000..8263585
--- /dev/null
+++ b/assets/javascripts/bundle.fe8b6f2b.min.js.map
@@ -0,0 +1,7 @@
+{
+  "version": 3,
+  "sources": ["node_modules/focus-visible/dist/focus-visible.js", "node_modules/clipboard/dist/clipboard.js", "node_modules/escape-html/index.js", "src/templates/assets/javascripts/bundle.ts", "node_modules/rxjs/node_modules/tslib/tslib.es6.js", "node_modules/rxjs/src/internal/util/isFunction.ts", "node_modules/rxjs/src/internal/util/createErrorClass.ts", "node_modules/rxjs/src/internal/util/UnsubscriptionError.ts", "node_modules/rxjs/src/internal/util/arrRemove.ts", "node_modules/rxjs/src/internal/Subscription.ts", "node_modules/rxjs/src/internal/config.ts", "node_modules/rxjs/src/internal/scheduler/timeoutProvider.ts", "node_modules/rxjs/src/internal/util/reportUnhandledError.ts", "node_modules/rxjs/src/internal/util/noop.ts", "node_modules/rxjs/src/internal/NotificationFactories.ts", "node_modules/rxjs/src/internal/util/errorContext.ts", "node_modules/rxjs/src/internal/Subscriber.ts", "node_modules/rxjs/src/internal/symbol/observable.ts", "node_modules/rxjs/src/internal/util/identity.ts", "node_modules/rxjs/src/internal/util/pipe.ts", "node_modules/rxjs/src/internal/Observable.ts", "node_modules/rxjs/src/internal/util/lift.ts", "node_modules/rxjs/src/internal/operators/OperatorSubscriber.ts", "node_modules/rxjs/src/internal/scheduler/animationFrameProvider.ts", "node_modules/rxjs/src/internal/util/ObjectUnsubscribedError.ts", "node_modules/rxjs/src/internal/Subject.ts", "node_modules/rxjs/src/internal/BehaviorSubject.ts", "node_modules/rxjs/src/internal/scheduler/dateTimestampProvider.ts", "node_modules/rxjs/src/internal/ReplaySubject.ts", "node_modules/rxjs/src/internal/scheduler/Action.ts", "node_modules/rxjs/src/internal/scheduler/intervalProvider.ts", "node_modules/rxjs/src/internal/scheduler/AsyncAction.ts", "node_modules/rxjs/src/internal/Scheduler.ts", "node_modules/rxjs/src/internal/scheduler/AsyncScheduler.ts", "node_modules/rxjs/src/internal/scheduler/async.ts", "node_modules/rxjs/src/internal/scheduler/QueueAction.ts", 
"node_modules/rxjs/src/internal/scheduler/QueueScheduler.ts", "node_modules/rxjs/src/internal/scheduler/queue.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameAction.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameScheduler.ts", "node_modules/rxjs/src/internal/scheduler/animationFrame.ts", "node_modules/rxjs/src/internal/observable/empty.ts", "node_modules/rxjs/src/internal/util/isScheduler.ts", "node_modules/rxjs/src/internal/util/args.ts", "node_modules/rxjs/src/internal/util/isArrayLike.ts", "node_modules/rxjs/src/internal/util/isPromise.ts", "node_modules/rxjs/src/internal/util/isInteropObservable.ts", "node_modules/rxjs/src/internal/util/isAsyncIterable.ts", "node_modules/rxjs/src/internal/util/throwUnobservableError.ts", "node_modules/rxjs/src/internal/symbol/iterator.ts", "node_modules/rxjs/src/internal/util/isIterable.ts", "node_modules/rxjs/src/internal/util/isReadableStreamLike.ts", "node_modules/rxjs/src/internal/observable/innerFrom.ts", "node_modules/rxjs/src/internal/util/executeSchedule.ts", "node_modules/rxjs/src/internal/operators/observeOn.ts", "node_modules/rxjs/src/internal/operators/subscribeOn.ts", "node_modules/rxjs/src/internal/scheduled/scheduleObservable.ts", "node_modules/rxjs/src/internal/scheduled/schedulePromise.ts", "node_modules/rxjs/src/internal/scheduled/scheduleArray.ts", "node_modules/rxjs/src/internal/scheduled/scheduleIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleAsyncIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleReadableStreamLike.ts", "node_modules/rxjs/src/internal/scheduled/scheduled.ts", "node_modules/rxjs/src/internal/observable/from.ts", "node_modules/rxjs/src/internal/observable/of.ts", "node_modules/rxjs/src/internal/observable/throwError.ts", "node_modules/rxjs/src/internal/util/EmptyError.ts", "node_modules/rxjs/src/internal/util/isDate.ts", "node_modules/rxjs/src/internal/operators/map.ts", "node_modules/rxjs/src/internal/util/mapOneOrManyArgs.ts", 
"node_modules/rxjs/src/internal/util/argsArgArrayOrObject.ts", "node_modules/rxjs/src/internal/util/createObject.ts", "node_modules/rxjs/src/internal/observable/combineLatest.ts", "node_modules/rxjs/src/internal/operators/mergeInternals.ts", "node_modules/rxjs/src/internal/operators/mergeMap.ts", "node_modules/rxjs/src/internal/operators/mergeAll.ts", "node_modules/rxjs/src/internal/operators/concatAll.ts", "node_modules/rxjs/src/internal/observable/concat.ts", "node_modules/rxjs/src/internal/observable/defer.ts", "node_modules/rxjs/src/internal/observable/fromEvent.ts", "node_modules/rxjs/src/internal/observable/fromEventPattern.ts", "node_modules/rxjs/src/internal/observable/timer.ts", "node_modules/rxjs/src/internal/observable/merge.ts", "node_modules/rxjs/src/internal/observable/never.ts", "node_modules/rxjs/src/internal/util/argsOrArgArray.ts", "node_modules/rxjs/src/internal/operators/filter.ts", "node_modules/rxjs/src/internal/observable/zip.ts", "node_modules/rxjs/src/internal/operators/audit.ts", "node_modules/rxjs/src/internal/operators/auditTime.ts", "node_modules/rxjs/src/internal/operators/bufferCount.ts", "node_modules/rxjs/src/internal/operators/catchError.ts", "node_modules/rxjs/src/internal/operators/scanInternals.ts", "node_modules/rxjs/src/internal/operators/combineLatest.ts", "node_modules/rxjs/src/internal/operators/combineLatestWith.ts", "node_modules/rxjs/src/internal/operators/debounce.ts", "node_modules/rxjs/src/internal/operators/debounceTime.ts", "node_modules/rxjs/src/internal/operators/defaultIfEmpty.ts", "node_modules/rxjs/src/internal/operators/take.ts", "node_modules/rxjs/src/internal/operators/ignoreElements.ts", "node_modules/rxjs/src/internal/operators/mapTo.ts", "node_modules/rxjs/src/internal/operators/delayWhen.ts", "node_modules/rxjs/src/internal/operators/delay.ts", "node_modules/rxjs/src/internal/operators/distinctUntilChanged.ts", "node_modules/rxjs/src/internal/operators/distinctUntilKeyChanged.ts", 
"node_modules/rxjs/src/internal/operators/throwIfEmpty.ts", "node_modules/rxjs/src/internal/operators/endWith.ts", "node_modules/rxjs/src/internal/operators/finalize.ts", "node_modules/rxjs/src/internal/operators/first.ts", "node_modules/rxjs/src/internal/operators/takeLast.ts", "node_modules/rxjs/src/internal/operators/merge.ts", "node_modules/rxjs/src/internal/operators/mergeWith.ts", "node_modules/rxjs/src/internal/operators/repeat.ts", "node_modules/rxjs/src/internal/operators/scan.ts", "node_modules/rxjs/src/internal/operators/share.ts", "node_modules/rxjs/src/internal/operators/shareReplay.ts", "node_modules/rxjs/src/internal/operators/skip.ts", "node_modules/rxjs/src/internal/operators/skipUntil.ts", "node_modules/rxjs/src/internal/operators/startWith.ts", "node_modules/rxjs/src/internal/operators/switchMap.ts", "node_modules/rxjs/src/internal/operators/takeUntil.ts", "node_modules/rxjs/src/internal/operators/takeWhile.ts", "node_modules/rxjs/src/internal/operators/tap.ts", "node_modules/rxjs/src/internal/operators/throttle.ts", "node_modules/rxjs/src/internal/operators/throttleTime.ts", "node_modules/rxjs/src/internal/operators/withLatestFrom.ts", "node_modules/rxjs/src/internal/operators/zip.ts", "node_modules/rxjs/src/internal/operators/zipWith.ts", "src/templates/assets/javascripts/browser/document/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", "src/templates/assets/javascripts/browser/element/focus/index.ts", "src/templates/assets/javascripts/browser/element/hover/index.ts", "src/templates/assets/javascripts/utilities/h/index.ts", "src/templates/assets/javascripts/utilities/round/index.ts", "src/templates/assets/javascripts/browser/script/index.ts", "src/templates/assets/javascripts/browser/element/size/_/index.ts", "src/templates/assets/javascripts/browser/element/size/content/index.ts", "src/templates/assets/javascripts/browser/element/offset/_/index.ts", 
"src/templates/assets/javascripts/browser/element/offset/content/index.ts", "src/templates/assets/javascripts/browser/element/visibility/index.ts", "src/templates/assets/javascripts/browser/toggle/index.ts", "src/templates/assets/javascripts/browser/keyboard/index.ts", "src/templates/assets/javascripts/browser/location/_/index.ts", "src/templates/assets/javascripts/browser/location/hash/index.ts", "src/templates/assets/javascripts/browser/media/index.ts", "src/templates/assets/javascripts/browser/request/index.ts", "src/templates/assets/javascripts/browser/viewport/offset/index.ts", "src/templates/assets/javascripts/browser/viewport/size/index.ts", "src/templates/assets/javascripts/browser/viewport/_/index.ts", "src/templates/assets/javascripts/browser/viewport/at/index.ts", "src/templates/assets/javascripts/browser/worker/index.ts", "src/templates/assets/javascripts/_/index.ts", "src/templates/assets/javascripts/components/_/index.ts", "src/templates/assets/javascripts/components/announce/index.ts", "src/templates/assets/javascripts/components/consent/index.ts", "src/templates/assets/javascripts/templates/tooltip/index.tsx", "src/templates/assets/javascripts/templates/annotation/index.tsx", "src/templates/assets/javascripts/templates/clipboard/index.tsx", "src/templates/assets/javascripts/templates/search/index.tsx", "src/templates/assets/javascripts/templates/source/index.tsx", "src/templates/assets/javascripts/templates/tabbed/index.tsx", "src/templates/assets/javascripts/templates/table/index.tsx", "src/templates/assets/javascripts/templates/version/index.tsx", "src/templates/assets/javascripts/components/tooltip2/index.ts", "src/templates/assets/javascripts/components/content/annotation/_/index.ts", "src/templates/assets/javascripts/components/content/annotation/list/index.ts", "src/templates/assets/javascripts/components/content/annotation/block/index.ts", "src/templates/assets/javascripts/components/content/code/_/index.ts", 
"src/templates/assets/javascripts/components/content/details/index.ts", "src/templates/assets/javascripts/components/content/mermaid/index.css", "src/templates/assets/javascripts/components/content/mermaid/index.ts", "src/templates/assets/javascripts/components/content/table/index.ts", "src/templates/assets/javascripts/components/content/tabs/index.ts", "src/templates/assets/javascripts/components/content/_/index.ts", "src/templates/assets/javascripts/components/dialog/index.ts", "src/templates/assets/javascripts/components/tooltip/index.ts", "src/templates/assets/javascripts/components/header/_/index.ts", "src/templates/assets/javascripts/components/header/title/index.ts", "src/templates/assets/javascripts/components/main/index.ts", "src/templates/assets/javascripts/components/palette/index.ts", "src/templates/assets/javascripts/components/progress/index.ts", "src/templates/assets/javascripts/integrations/clipboard/index.ts", "src/templates/assets/javascripts/integrations/sitemap/index.ts", "src/templates/assets/javascripts/integrations/instant/index.ts", "src/templates/assets/javascripts/integrations/search/highlighter/index.ts", "src/templates/assets/javascripts/integrations/search/worker/message/index.ts", "src/templates/assets/javascripts/integrations/search/worker/_/index.ts", "src/templates/assets/javascripts/integrations/version/index.ts", "src/templates/assets/javascripts/components/search/query/index.ts", "src/templates/assets/javascripts/components/search/result/index.ts", "src/templates/assets/javascripts/components/search/share/index.ts", "src/templates/assets/javascripts/components/search/suggest/index.ts", "src/templates/assets/javascripts/components/search/_/index.ts", "src/templates/assets/javascripts/components/search/highlight/index.ts", "src/templates/assets/javascripts/components/sidebar/index.ts", "src/templates/assets/javascripts/components/source/facts/github/index.ts", 
"src/templates/assets/javascripts/components/source/facts/gitlab/index.ts", "src/templates/assets/javascripts/components/source/facts/_/index.ts", "src/templates/assets/javascripts/components/source/_/index.ts", "src/templates/assets/javascripts/components/tabs/index.ts", "src/templates/assets/javascripts/components/toc/index.ts", "src/templates/assets/javascripts/components/top/index.ts", "src/templates/assets/javascripts/patches/ellipsis/index.ts", "src/templates/assets/javascripts/patches/indeterminate/index.ts", "src/templates/assets/javascripts/patches/scrollfix/index.ts", "src/templates/assets/javascripts/patches/scrolllock/index.ts", "src/templates/assets/javascripts/polyfills/index.ts"],
+  "sourcesContent": ["(function (global, factory) {\n  typeof exports === 'object' && typeof module !== 'undefined' ? factory() :\n  typeof define === 'function' && define.amd ? define(factory) :\n  (factory());\n}(this, (function () { 'use strict';\n\n  /**\n   * Applies the :focus-visible polyfill at the given scope.\n   * A scope in this case is either the top-level Document or a Shadow Root.\n   *\n   * @param {(Document|ShadowRoot)} scope\n   * @see https://github.com/WICG/focus-visible\n   */\n  function applyFocusVisiblePolyfill(scope) {\n    var hadKeyboardEvent = true;\n    var hadFocusVisibleRecently = false;\n    var hadFocusVisibleRecentlyTimeout = null;\n\n    var inputTypesAllowlist = {\n      text: true,\n      search: true,\n      url: true,\n      tel: true,\n      email: true,\n      password: true,\n      number: true,\n      date: true,\n      month: true,\n      week: true,\n      time: true,\n      datetime: true,\n      'datetime-local': true\n    };\n\n    /**\n     * Helper function for legacy browsers and iframes which sometimes focus\n     * elements like document, body, and non-interactive SVG.\n     * @param {Element} el\n     */\n    function isValidFocusTarget(el) {\n      if (\n        el &&\n        el !== document &&\n        el.nodeName !== 'HTML' &&\n        el.nodeName !== 'BODY' &&\n        'classList' in el &&\n        'contains' in el.classList\n      ) {\n        return true;\n      }\n      return false;\n    }\n\n    /**\n     * Computes whether the given element should automatically trigger the\n     * `focus-visible` class being added, i.e. 
whether it should always match\n     * `:focus-visible` when focused.\n     * @param {Element} el\n     * @return {boolean}\n     */\n    function focusTriggersKeyboardModality(el) {\n      var type = el.type;\n      var tagName = el.tagName;\n\n      if (tagName === 'INPUT' && inputTypesAllowlist[type] && !el.readOnly) {\n        return true;\n      }\n\n      if (tagName === 'TEXTAREA' && !el.readOnly) {\n        return true;\n      }\n\n      if (el.isContentEditable) {\n        return true;\n      }\n\n      return false;\n    }\n\n    /**\n     * Add the `focus-visible` class to the given element if it was not added by\n     * the author.\n     * @param {Element} el\n     */\n    function addFocusVisibleClass(el) {\n      if (el.classList.contains('focus-visible')) {\n        return;\n      }\n      el.classList.add('focus-visible');\n      el.setAttribute('data-focus-visible-added', '');\n    }\n\n    /**\n     * Remove the `focus-visible` class from the given element if it was not\n     * originally added by the author.\n     * @param {Element} el\n     */\n    function removeFocusVisibleClass(el) {\n      if (!el.hasAttribute('data-focus-visible-added')) {\n        return;\n      }\n      el.classList.remove('focus-visible');\n      el.removeAttribute('data-focus-visible-added');\n    }\n\n    /**\n     * If the most recent user interaction was via the keyboard;\n     * and the key press did not include a meta, alt/option, or control key;\n     * then the modality is keyboard. 
Otherwise, the modality is not keyboard.\n     * Apply `focus-visible` to any current active element and keep track\n     * of our keyboard modality state with `hadKeyboardEvent`.\n     * @param {KeyboardEvent} e\n     */\n    function onKeyDown(e) {\n      if (e.metaKey || e.altKey || e.ctrlKey) {\n        return;\n      }\n\n      if (isValidFocusTarget(scope.activeElement)) {\n        addFocusVisibleClass(scope.activeElement);\n      }\n\n      hadKeyboardEvent = true;\n    }\n\n    /**\n     * If at any point a user clicks with a pointing device, ensure that we change\n     * the modality away from keyboard.\n     * This avoids the situation where a user presses a key on an already focused\n     * element, and then clicks on a different element, focusing it with a\n     * pointing device, while we still think we're in keyboard modality.\n     * @param {Event} e\n     */\n    function onPointerDown(e) {\n      hadKeyboardEvent = false;\n    }\n\n    /**\n     * On `focus`, add the `focus-visible` class to the target if:\n     * - the target received focus as a result of keyboard navigation, or\n     * - the event target is an element that will likely require interaction\n     *   via the keyboard (e.g. 
a text box)\n     * @param {Event} e\n     */\n    function onFocus(e) {\n      // Prevent IE from focusing the document or HTML element.\n      if (!isValidFocusTarget(e.target)) {\n        return;\n      }\n\n      if (hadKeyboardEvent || focusTriggersKeyboardModality(e.target)) {\n        addFocusVisibleClass(e.target);\n      }\n    }\n\n    /**\n     * On `blur`, remove the `focus-visible` class from the target.\n     * @param {Event} e\n     */\n    function onBlur(e) {\n      if (!isValidFocusTarget(e.target)) {\n        return;\n      }\n\n      if (\n        e.target.classList.contains('focus-visible') ||\n        e.target.hasAttribute('data-focus-visible-added')\n      ) {\n        // To detect a tab/window switch, we look for a blur event followed\n        // rapidly by a visibility change.\n        // If we don't see a visibility change within 100ms, it's probably a\n        // regular focus change.\n        hadFocusVisibleRecently = true;\n        window.clearTimeout(hadFocusVisibleRecentlyTimeout);\n        hadFocusVisibleRecentlyTimeout = window.setTimeout(function() {\n          hadFocusVisibleRecently = false;\n        }, 100);\n        removeFocusVisibleClass(e.target);\n      }\n    }\n\n    /**\n     * If the user changes tabs, keep track of whether or not the previously\n     * focused element had .focus-visible.\n     * @param {Event} e\n     */\n    function onVisibilityChange(e) {\n      if (document.visibilityState === 'hidden') {\n        // If the tab becomes active again, the browser will handle calling focus\n        // on the element (Safari actually calls it twice).\n        // If this tab change caused a blur on an element with focus-visible,\n        // re-apply the class when the user switches back to the tab.\n        if (hadFocusVisibleRecently) {\n          hadKeyboardEvent = true;\n        }\n        addInitialPointerMoveListeners();\n      }\n    }\n\n    /**\n     * Add a group of listeners to detect usage of any pointing 
devices.\n     * These listeners will be added when the polyfill first loads, and anytime\n     * the window is blurred, so that they are active when the window regains\n     * focus.\n     */\n    function addInitialPointerMoveListeners() {\n      document.addEventListener('mousemove', onInitialPointerMove);\n      document.addEventListener('mousedown', onInitialPointerMove);\n      document.addEventListener('mouseup', onInitialPointerMove);\n      document.addEventListener('pointermove', onInitialPointerMove);\n      document.addEventListener('pointerdown', onInitialPointerMove);\n      document.addEventListener('pointerup', onInitialPointerMove);\n      document.addEventListener('touchmove', onInitialPointerMove);\n      document.addEventListener('touchstart', onInitialPointerMove);\n      document.addEventListener('touchend', onInitialPointerMove);\n    }\n\n    function removeInitialPointerMoveListeners() {\n      document.removeEventListener('mousemove', onInitialPointerMove);\n      document.removeEventListener('mousedown', onInitialPointerMove);\n      document.removeEventListener('mouseup', onInitialPointerMove);\n      document.removeEventListener('pointermove', onInitialPointerMove);\n      document.removeEventListener('pointerdown', onInitialPointerMove);\n      document.removeEventListener('pointerup', onInitialPointerMove);\n      document.removeEventListener('touchmove', onInitialPointerMove);\n      document.removeEventListener('touchstart', onInitialPointerMove);\n      document.removeEventListener('touchend', onInitialPointerMove);\n    }\n\n    /**\n     * When the polfyill first loads, assume the user is in keyboard modality.\n     * If any event is received from a pointing device (e.g. 
mouse, pointer,\n     * touch), turn off keyboard modality.\n     * This accounts for situations where focus enters the page from the URL bar.\n     * @param {Event} e\n     */\n    function onInitialPointerMove(e) {\n      // Work around a Safari quirk that fires a mousemove on <html> whenever the\n      // window blurs, even if you're tabbing out of the page. \u00AF\\_(\u30C4)_/\u00AF\n      if (e.target.nodeName && e.target.nodeName.toLowerCase() === 'html') {\n        return;\n      }\n\n      hadKeyboardEvent = false;\n      removeInitialPointerMoveListeners();\n    }\n\n    // For some kinds of state, we are interested in changes at the global scope\n    // only. For example, global pointer input, global key presses and global\n    // visibility change should affect the state at every scope:\n    document.addEventListener('keydown', onKeyDown, true);\n    document.addEventListener('mousedown', onPointerDown, true);\n    document.addEventListener('pointerdown', onPointerDown, true);\n    document.addEventListener('touchstart', onPointerDown, true);\n    document.addEventListener('visibilitychange', onVisibilityChange, true);\n\n    addInitialPointerMoveListeners();\n\n    // For focus and blur, we specifically care about state changes in the local\n    // scope. This is because focus / blur events that originate from within a\n    // shadow root are not re-dispatched from the host element if it was already\n    // the active element in its own scope:\n    scope.addEventListener('focus', onFocus, true);\n    scope.addEventListener('blur', onBlur, true);\n\n    // We detect that a node is a ShadowRoot by ensuring that it is a\n    // DocumentFragment and also has a host property. This check covers native\n    // implementation and polyfill implementation transparently. 
If we only cared\n    // about the native implementation, we could just check if the scope was\n    // an instance of a ShadowRoot.\n    if (scope.nodeType === Node.DOCUMENT_FRAGMENT_NODE && scope.host) {\n      // Since a ShadowRoot is a special kind of DocumentFragment, it does not\n      // have a root element to add a class to. So, we add this attribute to the\n      // host element instead:\n      scope.host.setAttribute('data-js-focus-visible', '');\n    } else if (scope.nodeType === Node.DOCUMENT_NODE) {\n      document.documentElement.classList.add('js-focus-visible');\n      document.documentElement.setAttribute('data-js-focus-visible', '');\n    }\n  }\n\n  // It is important to wrap all references to global window and document in\n  // these checks to support server-side rendering use cases\n  // @see https://github.com/WICG/focus-visible/issues/199\n  if (typeof window !== 'undefined' && typeof document !== 'undefined') {\n    // Make the polyfill helper globally available. This can be used as a signal\n    // to interested libraries that wish to coordinate with the polyfill for e.g.,\n    // applying the polyfill to a shadow root:\n    window.applyFocusVisiblePolyfill = applyFocusVisiblePolyfill;\n\n    // Notify interested libraries of the polyfill's presence, in case the\n    // polyfill was loaded lazily:\n    var event;\n\n    try {\n      event = new CustomEvent('focus-visible-polyfill-ready');\n    } catch (error) {\n      // IE11 does not support using CustomEvent as a constructor directly:\n      event = document.createEvent('CustomEvent');\n      event.initCustomEvent('focus-visible-polyfill-ready', false, false, {});\n    }\n\n    window.dispatchEvent(event);\n  }\n\n  if (typeof document !== 'undefined') {\n    // Apply the polyfill to the global document, so that no JavaScript\n    // coordination is required to use the polyfill in the top-level document:\n    applyFocusVisiblePolyfill(document);\n  }\n\n})));\n", "/*!\n * clipboard.js 
v2.0.11\n * https://clipboardjs.com/\n *\n * Licensed MIT \u00A9 Zeno Rocha\n */\n(function webpackUniversalModuleDefinition(root, factory) {\n\tif(typeof exports === 'object' && typeof module === 'object')\n\t\tmodule.exports = factory();\n\telse if(typeof define === 'function' && define.amd)\n\t\tdefine([], factory);\n\telse if(typeof exports === 'object')\n\t\texports[\"ClipboardJS\"] = factory();\n\telse\n\t\troot[\"ClipboardJS\"] = factory();\n})(this, function() {\nreturn /******/ (function() { // webpackBootstrap\n/******/ \tvar __webpack_modules__ = ({\n\n/***/ 686:\n/***/ (function(__unused_webpack_module, __webpack_exports__, __webpack_require__) {\n\n\"use strict\";\n\n// EXPORTS\n__webpack_require__.d(__webpack_exports__, {\n  \"default\": function() { return /* binding */ clipboard; }\n});\n\n// EXTERNAL MODULE: ./node_modules/tiny-emitter/index.js\nvar tiny_emitter = __webpack_require__(279);\nvar tiny_emitter_default = /*#__PURE__*/__webpack_require__.n(tiny_emitter);\n// EXTERNAL MODULE: ./node_modules/good-listener/src/listen.js\nvar listen = __webpack_require__(370);\nvar listen_default = /*#__PURE__*/__webpack_require__.n(listen);\n// EXTERNAL MODULE: ./node_modules/select/src/select.js\nvar src_select = __webpack_require__(817);\nvar select_default = /*#__PURE__*/__webpack_require__.n(src_select);\n;// CONCATENATED MODULE: ./src/common/command.js\n/**\n * Executes a given operation type.\n * @param {String} type\n * @return {Boolean}\n */\nfunction command(type) {\n  try {\n    return document.execCommand(type);\n  } catch (err) {\n    return false;\n  }\n}\n;// CONCATENATED MODULE: ./src/actions/cut.js\n\n\n/**\n * Cut action wrapper.\n * @param {String|HTMLElement} target\n * @return {String}\n */\n\nvar ClipboardActionCut = function ClipboardActionCut(target) {\n  var selectedText = select_default()(target);\n  command('cut');\n  return selectedText;\n};\n\n/* harmony default export */ var actions_cut = (ClipboardActionCut);\n;// CONCATENATED 
MODULE: ./src/common/create-fake-element.js\n/**\n * Creates a fake textarea element with a value.\n * @param {String} value\n * @return {HTMLElement}\n */\nfunction createFakeElement(value) {\n  var isRTL = document.documentElement.getAttribute('dir') === 'rtl';\n  var fakeElement = document.createElement('textarea'); // Prevent zooming on iOS\n\n  fakeElement.style.fontSize = '12pt'; // Reset box model\n\n  fakeElement.style.border = '0';\n  fakeElement.style.padding = '0';\n  fakeElement.style.margin = '0'; // Move element out of screen horizontally\n\n  fakeElement.style.position = 'absolute';\n  fakeElement.style[isRTL ? 'right' : 'left'] = '-9999px'; // Move element to the same position vertically\n\n  var yPosition = window.pageYOffset || document.documentElement.scrollTop;\n  fakeElement.style.top = \"\".concat(yPosition, \"px\");\n  fakeElement.setAttribute('readonly', '');\n  fakeElement.value = value;\n  return fakeElement;\n}\n;// CONCATENATED MODULE: ./src/actions/copy.js\n\n\n\n/**\n * Create fake copy action wrapper using a fake element.\n * @param {String} target\n * @param {Object} options\n * @return {String}\n */\n\nvar fakeCopyAction = function fakeCopyAction(value, options) {\n  var fakeElement = createFakeElement(value);\n  options.container.appendChild(fakeElement);\n  var selectedText = select_default()(fakeElement);\n  command('copy');\n  fakeElement.remove();\n  return selectedText;\n};\n/**\n * Copy action wrapper.\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @return {String}\n */\n\n\nvar ClipboardActionCopy = function ClipboardActionCopy(target) {\n  var options = arguments.length > 1 && arguments[1] !== undefined ? 
arguments[1] : {\n    container: document.body\n  };\n  var selectedText = '';\n\n  if (typeof target === 'string') {\n    selectedText = fakeCopyAction(target, options);\n  } else if (target instanceof HTMLInputElement && !['text', 'search', 'url', 'tel', 'password'].includes(target === null || target === void 0 ? void 0 : target.type)) {\n    // If input type doesn't support `setSelectionRange`. Simulate it. https://developer.mozilla.org/en-US/docs/Web/API/HTMLInputElement/setSelectionRange\n    selectedText = fakeCopyAction(target.value, options);\n  } else {\n    selectedText = select_default()(target);\n    command('copy');\n  }\n\n  return selectedText;\n};\n\n/* harmony default export */ var actions_copy = (ClipboardActionCopy);\n;// CONCATENATED MODULE: ./src/actions/default.js\nfunction _typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return _typeof(obj); }\n\n\n\n/**\n * Inner function which performs selection from either `text` or `target`\n * properties and then executes copy or cut operations.\n * @param {Object} options\n */\n\nvar ClipboardActionDefault = function ClipboardActionDefault() {\n  var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n  // Defines base properties passed from constructor.\n  var _options$action = options.action,\n      action = _options$action === void 0 ? 
'copy' : _options$action,\n      container = options.container,\n      target = options.target,\n      text = options.text; // Sets the `action` to be performed which can be either 'copy' or 'cut'.\n\n  if (action !== 'copy' && action !== 'cut') {\n    throw new Error('Invalid \"action\" value, use either \"copy\" or \"cut\"');\n  } // Sets the `target` property using an element that will be have its content copied.\n\n\n  if (target !== undefined) {\n    if (target && _typeof(target) === 'object' && target.nodeType === 1) {\n      if (action === 'copy' && target.hasAttribute('disabled')) {\n        throw new Error('Invalid \"target\" attribute. Please use \"readonly\" instead of \"disabled\" attribute');\n      }\n\n      if (action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {\n        throw new Error('Invalid \"target\" attribute. You can\\'t cut text from elements with \"readonly\" or \"disabled\" attributes');\n      }\n    } else {\n      throw new Error('Invalid \"target\" value, use a valid Element');\n    }\n  } // Define selection strategy based on `text` property.\n\n\n  if (text) {\n    return actions_copy(text, {\n      container: container\n    });\n  } // Defines which selection strategy based on `target` property.\n\n\n  if (target) {\n    return action === 'cut' ? actions_cut(target) : actions_copy(target, {\n      container: container\n    });\n  }\n};\n\n/* harmony default export */ var actions_default = (ClipboardActionDefault);\n;// CONCATENATED MODULE: ./src/clipboard.js\nfunction clipboard_typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { clipboard_typeof = function _typeof(obj) { return typeof obj; }; } else { clipboard_typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? 
\"symbol\" : typeof obj; }; } return clipboard_typeof(obj); }\n\nfunction _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError(\"Cannot call a class as a function\"); } }\n\nfunction _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if (\"value\" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } }\n\nfunction _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; }\n\nfunction _inherits(subClass, superClass) { if (typeof superClass !== \"function\" && superClass !== null) { throw new TypeError(\"Super expression must either be null or a function\"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); }\n\nfunction _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); }\n\nfunction _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }\n\nfunction _possibleConstructorReturn(self, call) { if (call && (clipboard_typeof(call) === \"object\" || typeof call === \"function\")) { return call; } return _assertThisInitialized(self); }\n\nfunction _assertThisInitialized(self) { if 
(self === void 0) { throw new ReferenceError(\"this hasn't been initialised - super() hasn't been called\"); } return self; }\n\nfunction _isNativeReflectConstruct() { if (typeof Reflect === \"undefined\" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === \"function\") return true; try { Date.prototype.toString.call(Reflect.construct(Date, [], function () {})); return true; } catch (e) { return false; } }\n\nfunction _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); }\n\n\n\n\n\n\n/**\n * Helper function to retrieve attribute value.\n * @param {String} suffix\n * @param {Element} element\n */\n\nfunction getAttributeValue(suffix, element) {\n  var attribute = \"data-clipboard-\".concat(suffix);\n\n  if (!element.hasAttribute(attribute)) {\n    return;\n  }\n\n  return element.getAttribute(attribute);\n}\n/**\n * Base class which takes one or more elements, adds event listeners to them,\n * and instantiates a new `ClipboardAction` on each click.\n */\n\n\nvar Clipboard = /*#__PURE__*/function (_Emitter) {\n  _inherits(Clipboard, _Emitter);\n\n  var _super = _createSuper(Clipboard);\n\n  /**\n   * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n   * @param {Object} options\n   */\n  function Clipboard(trigger, options) {\n    var _this;\n\n    _classCallCheck(this, Clipboard);\n\n    _this = _super.call(this);\n\n    _this.resolveOptions(options);\n\n    _this.listenClick(trigger);\n\n    return _this;\n  }\n  /**\n   * Defines if attributes would be resolved using internal setter functions\n   * or custom functions that were passed in the constructor.\n   * @param {Object} options\n   */\n\n\n  _createClass(Clipboard, [{\n    key: \"resolveOptions\",\n    value: function resolveOptions() {\n      var options = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : {};\n      this.action = typeof options.action === 'function' ? options.action : this.defaultAction;\n      this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;\n      this.text = typeof options.text === 'function' ? options.text : this.defaultText;\n      this.container = clipboard_typeof(options.container) === 'object' ? options.container : document.body;\n    }\n    /**\n     * Adds a click event listener to the passed trigger.\n     * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n     */\n\n  }, {\n    key: \"listenClick\",\n    value: function listenClick(trigger) {\n      var _this2 = this;\n\n      this.listener = listen_default()(trigger, 'click', function (e) {\n        return _this2.onClick(e);\n      });\n    }\n    /**\n     * Defines a new `ClipboardAction` on each click event.\n     * @param {Event} e\n     */\n\n  }, {\n    key: \"onClick\",\n    value: function onClick(e) {\n      var trigger = e.delegateTarget || e.currentTarget;\n      var action = this.action(trigger) || 'copy';\n      var text = actions_default({\n        action: action,\n        container: this.container,\n        target: this.target(trigger),\n        text: this.text(trigger)\n      }); // Fires an event based on the copy operation result.\n\n      this.emit(text ? 
'success' : 'error', {\n        action: action,\n        text: text,\n        trigger: trigger,\n        clearSelection: function clearSelection() {\n          if (trigger) {\n            trigger.focus();\n          }\n\n          window.getSelection().removeAllRanges();\n        }\n      });\n    }\n    /**\n     * Default `action` lookup function.\n     * @param {Element} trigger\n     */\n\n  }, {\n    key: \"defaultAction\",\n    value: function defaultAction(trigger) {\n      return getAttributeValue('action', trigger);\n    }\n    /**\n     * Default `target` lookup function.\n     * @param {Element} trigger\n     */\n\n  }, {\n    key: \"defaultTarget\",\n    value: function defaultTarget(trigger) {\n      var selector = getAttributeValue('target', trigger);\n\n      if (selector) {\n        return document.querySelector(selector);\n      }\n    }\n    /**\n     * Allow fire programmatically a copy action\n     * @param {String|HTMLElement} target\n     * @param {Object} options\n     * @returns Text copied.\n     */\n\n  }, {\n    key: \"defaultText\",\n\n    /**\n     * Default `text` lookup function.\n     * @param {Element} trigger\n     */\n    value: function defaultText(trigger) {\n      return getAttributeValue('text', trigger);\n    }\n    /**\n     * Destroy lifecycle.\n     */\n\n  }, {\n    key: \"destroy\",\n    value: function destroy() {\n      this.listener.destroy();\n    }\n  }], [{\n    key: \"copy\",\n    value: function copy(target) {\n      var options = arguments.length > 1 && arguments[1] !== undefined ? 
arguments[1] : {\n        container: document.body\n      };\n      return actions_copy(target, options);\n    }\n    /**\n     * Allow fire programmatically a cut action\n     * @param {String|HTMLElement} target\n     * @returns Text cutted.\n     */\n\n  }, {\n    key: \"cut\",\n    value: function cut(target) {\n      return actions_cut(target);\n    }\n    /**\n     * Returns the support of the given action, or all actions if no action is\n     * given.\n     * @param {String} [action]\n     */\n\n  }, {\n    key: \"isSupported\",\n    value: function isSupported() {\n      var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];\n      var actions = typeof action === 'string' ? [action] : action;\n      var support = !!document.queryCommandSupported;\n      actions.forEach(function (action) {\n        support = support && !!document.queryCommandSupported(action);\n      });\n      return support;\n    }\n  }]);\n\n  return Clipboard;\n}((tiny_emitter_default()));\n\n/* harmony default export */ var clipboard = (Clipboard);\n\n/***/ }),\n\n/***/ 828:\n/***/ (function(module) {\n\nvar DOCUMENT_NODE_TYPE = 9;\n\n/**\n * A polyfill for Element.matches()\n */\nif (typeof Element !== 'undefined' && !Element.prototype.matches) {\n    var proto = Element.prototype;\n\n    proto.matches = proto.matchesSelector ||\n                    proto.mozMatchesSelector ||\n                    proto.msMatchesSelector ||\n                    proto.oMatchesSelector ||\n                    proto.webkitMatchesSelector;\n}\n\n/**\n * Finds the closest parent that matches a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @return {Function}\n */\nfunction closest (element, selector) {\n    while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {\n        if (typeof element.matches === 'function' &&\n            element.matches(selector)) {\n          return element;\n        }\n        element = 
element.parentNode;\n    }\n}\n\nmodule.exports = closest;\n\n\n/***/ }),\n\n/***/ 438:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar closest = __webpack_require__(828);\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction _delegate(element, selector, type, callback, useCapture) {\n    var listenerFn = listener.apply(this, arguments);\n\n    element.addEventListener(type, listenerFn, useCapture);\n\n    return {\n        destroy: function() {\n            element.removeEventListener(type, listenerFn, useCapture);\n        }\n    }\n}\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element|String|Array} [elements]\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction delegate(elements, selector, type, callback, useCapture) {\n    // Handle the regular Element usage\n    if (typeof elements.addEventListener === 'function') {\n        return _delegate.apply(null, arguments);\n    }\n\n    // Handle Element-less usage, it defaults to global delegation\n    if (typeof type === 'function') {\n        // Use `document` as the first parameter, then apply arguments\n        // This is a short way to .unshift `arguments` without running into deoptimizations\n        return _delegate.bind(null, document).apply(null, arguments);\n    }\n\n    // Handle Selector-based usage\n    if (typeof elements === 'string') {\n        elements = document.querySelectorAll(elements);\n    }\n\n    // Handle Array-like based usage\n    return Array.prototype.map.call(elements, function (element) {\n        return _delegate(element, selector, type, callback, useCapture);\n    });\n}\n\n/**\n * Finds closest match and invokes callback.\n *\n * @param {Element} element\n * @param 
{String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Function}\n */\nfunction listener(element, selector, type, callback) {\n    return function(e) {\n        e.delegateTarget = closest(e.target, selector);\n\n        if (e.delegateTarget) {\n            callback.call(element, e);\n        }\n    }\n}\n\nmodule.exports = delegate;\n\n\n/***/ }),\n\n/***/ 879:\n/***/ (function(__unused_webpack_module, exports) {\n\n/**\n * Check if argument is a HTML element.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.node = function(value) {\n    return value !== undefined\n        && value instanceof HTMLElement\n        && value.nodeType === 1;\n};\n\n/**\n * Check if argument is a list of HTML elements.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.nodeList = function(value) {\n    var type = Object.prototype.toString.call(value);\n\n    return value !== undefined\n        && (type === '[object NodeList]' || type === '[object HTMLCollection]')\n        && ('length' in value)\n        && (value.length === 0 || exports.node(value[0]));\n};\n\n/**\n * Check if argument is a string.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.string = function(value) {\n    return typeof value === 'string'\n        || value instanceof String;\n};\n\n/**\n * Check if argument is a function.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.fn = function(value) {\n    var type = Object.prototype.toString.call(value);\n\n    return type === '[object Function]';\n};\n\n\n/***/ }),\n\n/***/ 370:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar is = __webpack_require__(879);\nvar delegate = __webpack_require__(438);\n\n/**\n * Validates all params and calls the right\n * listener function based on its target type.\n *\n * @param {String|HTMLElement|HTMLCollection|NodeList} target\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n 
*/\nfunction listen(target, type, callback) {\n    if (!target && !type && !callback) {\n        throw new Error('Missing required arguments');\n    }\n\n    if (!is.string(type)) {\n        throw new TypeError('Second argument must be a String');\n    }\n\n    if (!is.fn(callback)) {\n        throw new TypeError('Third argument must be a Function');\n    }\n\n    if (is.node(target)) {\n        return listenNode(target, type, callback);\n    }\n    else if (is.nodeList(target)) {\n        return listenNodeList(target, type, callback);\n    }\n    else if (is.string(target)) {\n        return listenSelector(target, type, callback);\n    }\n    else {\n        throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');\n    }\n}\n\n/**\n * Adds an event listener to a HTML element\n * and returns a remove listener function.\n *\n * @param {HTMLElement} node\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNode(node, type, callback) {\n    node.addEventListener(type, callback);\n\n    return {\n        destroy: function() {\n            node.removeEventListener(type, callback);\n        }\n    }\n}\n\n/**\n * Add an event listener to a list of HTML elements\n * and returns a remove listener function.\n *\n * @param {NodeList|HTMLCollection} nodeList\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNodeList(nodeList, type, callback) {\n    Array.prototype.forEach.call(nodeList, function(node) {\n        node.addEventListener(type, callback);\n    });\n\n    return {\n        destroy: function() {\n            Array.prototype.forEach.call(nodeList, function(node) {\n                node.removeEventListener(type, callback);\n            });\n        }\n    }\n}\n\n/**\n * Add an event listener to a selector\n * and returns a remove listener function.\n *\n * @param {String} selector\n * @param {String} type\n * @param {Function} 
callback\n * @return {Object}\n */\nfunction listenSelector(selector, type, callback) {\n    return delegate(document.body, selector, type, callback);\n}\n\nmodule.exports = listen;\n\n\n/***/ }),\n\n/***/ 817:\n/***/ (function(module) {\n\nfunction select(element) {\n    var selectedText;\n\n    if (element.nodeName === 'SELECT') {\n        element.focus();\n\n        selectedText = element.value;\n    }\n    else if (element.nodeName === 'INPUT' || element.nodeName === 'TEXTAREA') {\n        var isReadOnly = element.hasAttribute('readonly');\n\n        if (!isReadOnly) {\n            element.setAttribute('readonly', '');\n        }\n\n        element.select();\n        element.setSelectionRange(0, element.value.length);\n\n        if (!isReadOnly) {\n            element.removeAttribute('readonly');\n        }\n\n        selectedText = element.value;\n    }\n    else {\n        if (element.hasAttribute('contenteditable')) {\n            element.focus();\n        }\n\n        var selection = window.getSelection();\n        var range = document.createRange();\n\n        range.selectNodeContents(element);\n        selection.removeAllRanges();\n        selection.addRange(range);\n\n        selectedText = selection.toString();\n    }\n\n    return selectedText;\n}\n\nmodule.exports = select;\n\n\n/***/ }),\n\n/***/ 279:\n/***/ (function(module) {\n\nfunction E () {\n  // Keep this empty so it's easier to inherit from\n  // (via https://github.com/lipsmack from https://github.com/scottcorgan/tiny-emitter/issues/3)\n}\n\nE.prototype = {\n  on: function (name, callback, ctx) {\n    var e = this.e || (this.e = {});\n\n    (e[name] || (e[name] = [])).push({\n      fn: callback,\n      ctx: ctx\n    });\n\n    return this;\n  },\n\n  once: function (name, callback, ctx) {\n    var self = this;\n    function listener () {\n      self.off(name, listener);\n      callback.apply(ctx, arguments);\n    };\n\n    listener._ = callback\n    return this.on(name, listener, ctx);\n  
},\n\n  emit: function (name) {\n    var data = [].slice.call(arguments, 1);\n    var evtArr = ((this.e || (this.e = {}))[name] || []).slice();\n    var i = 0;\n    var len = evtArr.length;\n\n    for (i; i < len; i++) {\n      evtArr[i].fn.apply(evtArr[i].ctx, data);\n    }\n\n    return this;\n  },\n\n  off: function (name, callback) {\n    var e = this.e || (this.e = {});\n    var evts = e[name];\n    var liveEvents = [];\n\n    if (evts && callback) {\n      for (var i = 0, len = evts.length; i < len; i++) {\n        if (evts[i].fn !== callback && evts[i].fn._ !== callback)\n          liveEvents.push(evts[i]);\n      }\n    }\n\n    // Remove event from queue to prevent memory leak\n    // Suggested by https://github.com/lazd\n    // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910\n\n    (liveEvents.length)\n      ? e[name] = liveEvents\n      : delete e[name];\n\n    return this;\n  }\n};\n\nmodule.exports = E;\nmodule.exports.TinyEmitter = E;\n\n\n/***/ })\n\n/******/ \t});\n/************************************************************************/\n/******/ \t// The module cache\n/******/ \tvar __webpack_module_cache__ = {};\n/******/ \t\n/******/ \t// The require function\n/******/ \tfunction __webpack_require__(moduleId) {\n/******/ \t\t// Check if module is in cache\n/******/ \t\tif(__webpack_module_cache__[moduleId]) {\n/******/ \t\t\treturn __webpack_module_cache__[moduleId].exports;\n/******/ \t\t}\n/******/ \t\t// Create a new module (and put it into the cache)\n/******/ \t\tvar module = __webpack_module_cache__[moduleId] = {\n/******/ \t\t\t// no module.id needed\n/******/ \t\t\t// no module.loaded needed\n/******/ \t\t\texports: {}\n/******/ \t\t};\n/******/ \t\n/******/ \t\t// Execute the module function\n/******/ \t\t__webpack_modules__[moduleId](module, module.exports, __webpack_require__);\n/******/ \t\n/******/ \t\t// Return the exports of the module\n/******/ \t\treturn 
module.exports;\n/******/ \t}\n/******/ \t\n/************************************************************************/\n/******/ \t/* webpack/runtime/compat get default export */\n/******/ \t!function() {\n/******/ \t\t// getDefaultExport function for compatibility with non-harmony modules\n/******/ \t\t__webpack_require__.n = function(module) {\n/******/ \t\t\tvar getter = module && module.__esModule ?\n/******/ \t\t\t\tfunction() { return module['default']; } :\n/******/ \t\t\t\tfunction() { return module; };\n/******/ \t\t\t__webpack_require__.d(getter, { a: getter });\n/******/ \t\t\treturn getter;\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/define property getters */\n/******/ \t!function() {\n/******/ \t\t// define getter functions for harmony exports\n/******/ \t\t__webpack_require__.d = function(exports, definition) {\n/******/ \t\t\tfor(var key in definition) {\n/******/ \t\t\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n/******/ \t\t\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n/******/ \t\t\t\t}\n/******/ \t\t\t}\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/hasOwnProperty shorthand */\n/******/ \t!function() {\n/******/ \t\t__webpack_require__.o = function(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }\n/******/ \t}();\n/******/ \t\n/************************************************************************/\n/******/ \t// module exports must be returned from runtime so entry inlining is disabled\n/******/ \t// startup\n/******/ \t// Load entry module and return exports\n/******/ \treturn __webpack_require__(686);\n/******/ })()\n.default;\n});", "/*!\n * escape-html\n * Copyright(c) 2012-2013 TJ Holowaychuk\n * Copyright(c) 2015 Andreas Lubbe\n * Copyright(c) 2015 Tiancheng \"Timothy\" Gu\n * MIT Licensed\n */\n\n'use strict';\n\n/**\n * Module variables.\n * @private\n */\n\nvar 
matchHtmlRegExp = /[\"'&<>]/;\n\n/**\n * Module exports.\n * @public\n */\n\nmodule.exports = escapeHtml;\n\n/**\n * Escape special characters in the given string of html.\n *\n * @param  {string} string The string to escape for inserting into HTML\n * @return {string}\n * @public\n */\n\nfunction escapeHtml(string) {\n  var str = '' + string;\n  var match = matchHtmlRegExp.exec(str);\n\n  if (!match) {\n    return str;\n  }\n\n  var escape;\n  var html = '';\n  var index = 0;\n  var lastIndex = 0;\n\n  for (index = match.index; index < str.length; index++) {\n    switch (str.charCodeAt(index)) {\n      case 34: // \"\n        escape = '&quot;';\n        break;\n      case 38: // &\n        escape = '&amp;';\n        break;\n      case 39: // '\n        escape = '&#39;';\n        break;\n      case 60: // <\n        escape = '&lt;';\n        break;\n      case 62: // >\n        escape = '&gt;';\n        break;\n      default:\n        continue;\n    }\n\n    if (lastIndex !== index) {\n      html += str.substring(lastIndex, index);\n    }\n\n    lastIndex = index + 1;\n    html += escape;\n  }\n\n  return lastIndex !== index\n    ? 
html + str.substring(lastIndex, index)\n    : html;\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport \"focus-visible\"\n\nimport {\n  EMPTY,\n  NEVER,\n  Observable,\n  Subject,\n  defer,\n  delay,\n  filter,\n  map,\n  merge,\n  mergeWith,\n  shareReplay,\n  switchMap\n} from \"rxjs\"\n\nimport { configuration, feature } from \"./_\"\nimport {\n  at,\n  getActiveElement,\n  getOptionalElement,\n  requestJSON,\n  setLocation,\n  setToggle,\n  watchDocument,\n  watchKeyboard,\n  watchLocation,\n  watchLocationTarget,\n  watchMedia,\n  watchPrint,\n  watchScript,\n  watchViewport\n} from \"./browser\"\nimport {\n  getComponentElement,\n  getComponentElements,\n  mountAnnounce,\n  mountBackToTop,\n  mountConsent,\n  mountContent,\n  mountDialog,\n  mountHeader,\n  mountHeaderTitle,\n  mountPalette,\n  mountProgress,\n  mountSearch,\n  
mountSearchHiglight,\n  mountSidebar,\n  mountSource,\n  mountTableOfContents,\n  mountTabs,\n  watchHeader,\n  watchMain\n} from \"./components\"\nimport {\n  SearchIndex,\n  setupClipboardJS,\n  setupInstantNavigation,\n  setupVersionSelector\n} from \"./integrations\"\nimport {\n  patchEllipsis,\n  patchIndeterminate,\n  patchScrollfix,\n  patchScrolllock\n} from \"./patches\"\nimport \"./polyfills\"\n\n/* ----------------------------------------------------------------------------\n * Functions - @todo refactor\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch search index\n *\n * @returns Search index observable\n */\nfunction fetchSearchIndex(): Observable<SearchIndex> {\n  if (location.protocol === \"file:\") {\n    return watchScript(\n      `${new URL(\"search/search_index.js\", config.base)}`\n    )\n      .pipe(\n        // @ts-ignore - @todo fix typings\n        map(() => __index),\n        shareReplay(1)\n      )\n  } else {\n    return requestJSON<SearchIndex>(\n      new URL(\"search/search_index.json\", config.base)\n    )\n  }\n}\n\n/* ----------------------------------------------------------------------------\n * Application\n * ------------------------------------------------------------------------- */\n\n/* Yay, JavaScript is available */\ndocument.documentElement.classList.remove(\"no-js\")\ndocument.documentElement.classList.add(\"js\")\n\n/* Set up navigation observables and subjects */\nconst document$ = watchDocument()\nconst location$ = watchLocation()\nconst target$   = watchLocationTarget(location$)\nconst keyboard$ = watchKeyboard()\n\n/* Set up media observables */\nconst viewport$ = watchViewport()\nconst tablet$   = watchMedia(\"(min-width: 960px)\")\nconst screen$   = watchMedia(\"(min-width: 1220px)\")\nconst print$    = watchPrint()\n\n/* Retrieve search index, if search is enabled */\nconst config = configuration()\nconst index$ = document.forms.namedItem(\"search\")\n  ? 
fetchSearchIndex()\n  : NEVER\n\n/* Set up Clipboard.js integration */\nconst alert$ = new Subject<string>()\nsetupClipboardJS({ alert$ })\n\n/* Set up progress indicator */\nconst progress$ = new Subject<number>()\n\n/* Set up instant navigation, if enabled */\nif (feature(\"navigation.instant\"))\n  setupInstantNavigation({ location$, viewport$, progress$ })\n    .subscribe(document$)\n\n/* Set up version selector */\nif (config.version?.provider === \"mike\")\n  setupVersionSelector({ document$ })\n\n/* Always close drawer and search on navigation */\nmerge(location$, target$)\n  .pipe(\n    delay(125)\n  )\n    .subscribe(() => {\n      setToggle(\"drawer\", false)\n      setToggle(\"search\", false)\n    })\n\n/* Set up global keyboard handlers */\nkeyboard$\n  .pipe(\n    filter(({ mode }) => mode === \"global\")\n  )\n    .subscribe(key => {\n      switch (key.type) {\n\n        /* Go to previous page */\n        case \"p\":\n        case \",\":\n          const prev = getOptionalElement<HTMLLinkElement>(\"link[rel=prev]\")\n          if (typeof prev !== \"undefined\")\n            setLocation(prev)\n          break\n\n        /* Go to next page */\n        case \"n\":\n        case \".\":\n          const next = getOptionalElement<HTMLLinkElement>(\"link[rel=next]\")\n          if (typeof next !== \"undefined\")\n            setLocation(next)\n          break\n\n        /* Expand navigation, see https://bit.ly/3ZjG5io */\n        case \"Enter\":\n          const active = getActiveElement()\n          if (active instanceof HTMLLabelElement)\n            active.click()\n      }\n    })\n\n/* Set up patches */\npatchEllipsis({ viewport$, document$ })\npatchIndeterminate({ document$, tablet$ })\npatchScrollfix({ document$ })\npatchScrolllock({ viewport$, tablet$ })\n\n/* Set up header and main area observable */\nconst header$ = watchHeader(getComponentElement(\"header\"), { viewport$ })\nconst main$ = document$\n  .pipe(\n    map(() => 
getComponentElement(\"main\")),\n    switchMap(el => watchMain(el, { viewport$, header$ })),\n    shareReplay(1)\n  )\n\n/* Set up control component observables */\nconst control$ = merge(\n\n  /* Consent */\n  ...getComponentElements(\"consent\")\n    .map(el => mountConsent(el, { target$ })),\n\n  /* Dialog */\n  ...getComponentElements(\"dialog\")\n    .map(el => mountDialog(el, { alert$ })),\n\n  /* Header */\n  ...getComponentElements(\"header\")\n    .map(el => mountHeader(el, { viewport$, header$, main$ })),\n\n  /* Color palette */\n  ...getComponentElements(\"palette\")\n    .map(el => mountPalette(el)),\n\n  /* Progress bar */\n  ...getComponentElements(\"progress\")\n    .map(el => mountProgress(el, { progress$ })),\n\n  /* Search */\n  ...getComponentElements(\"search\")\n    .map(el => mountSearch(el, { index$, keyboard$ })),\n\n  /* Repository information */\n  ...getComponentElements(\"source\")\n    .map(el => mountSource(el))\n)\n\n/* Set up content component observables */\nconst content$ = defer(() => merge(\n\n  /* Announcement bar */\n  ...getComponentElements(\"announce\")\n    .map(el => mountAnnounce(el)),\n\n  /* Content */\n  ...getComponentElements(\"content\")\n    .map(el => mountContent(el, { viewport$, target$, print$ })),\n\n  /* Search highlighting */\n  ...getComponentElements(\"content\")\n    .map(el => feature(\"search.highlight\")\n      ? mountSearchHiglight(el, { index$, location$ })\n      : EMPTY\n    ),\n\n  /* Header title */\n  ...getComponentElements(\"header-title\")\n    .map(el => mountHeaderTitle(el, { viewport$, header$ })),\n\n  /* Sidebar */\n  ...getComponentElements(\"sidebar\")\n    .map(el => el.getAttribute(\"data-md-type\") === \"navigation\"\n      ? 
at(screen$, () => mountSidebar(el, { viewport$, header$, main$ }))\n      : at(tablet$, () => mountSidebar(el, { viewport$, header$, main$ }))\n    ),\n\n  /* Navigation tabs */\n  ...getComponentElements(\"tabs\")\n    .map(el => mountTabs(el, { viewport$, header$ })),\n\n  /* Table of contents */\n  ...getComponentElements(\"toc\")\n    .map(el => mountTableOfContents(el, {\n      viewport$, header$, main$, target$\n    })),\n\n  /* Back-to-top button */\n  ...getComponentElements(\"top\")\n    .map(el => mountBackToTop(el, { viewport$, header$, main$, target$ }))\n))\n\n/* Set up component observables */\nconst component$ = document$\n  .pipe(\n    switchMap(() => content$),\n    mergeWith(control$),\n    shareReplay(1)\n  )\n\n/* Subscribe to all components */\ncomponent$.subscribe()\n\n/* ----------------------------------------------------------------------------\n * Exports\n * ------------------------------------------------------------------------- */\n\nwindow.document$  = document$          /* Document observable */\nwindow.location$  = location$          /* Location subject */\nwindow.target$    = target$            /* Location target observable */\nwindow.keyboard$  = keyboard$          /* Keyboard observable */\nwindow.viewport$  = viewport$          /* Viewport observable */\nwindow.tablet$    = tablet$            /* Media tablet observable */\nwindow.screen$    = screen$            /* Media screen observable */\nwindow.print$     = print$             /* Media print observable */\nwindow.alert$     = alert$             /* Alert subject */\nwindow.progress$  = progress$          /* Progress indicator subject */\nwindow.component$ = component$         /* Component observable */\n", "/*! 
*****************************************************************************\r\nCopyright (c) Microsoft Corporation.\r\n\r\nPermission to use, copy, modify, and/or distribute this software for any\r\npurpose with or without fee is hereby granted.\r\n\r\nTHE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH\r\nREGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY\r\nAND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,\r\nINDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\r\nLOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR\r\nOTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\r\nPERFORMANCE OF THIS SOFTWARE.\r\n***************************************************************************** */\r\n/* global Reflect, Promise */\r\n\r\nvar extendStatics = function(d, b) {\r\n    extendStatics = Object.setPrototypeOf ||\r\n        ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||\r\n        function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };\r\n    return extendStatics(d, b);\r\n};\r\n\r\nexport function __extends(d, b) {\r\n    if (typeof b !== \"function\" && b !== null)\r\n        throw new TypeError(\"Class extends value \" + String(b) + \" is not a constructor or null\");\r\n    extendStatics(d, b);\r\n    function __() { this.constructor = d; }\r\n    d.prototype = b === null ? 
Object.create(b) : (__.prototype = b.prototype, new __());\r\n}\r\n\r\nexport var __assign = function() {\r\n    __assign = Object.assign || function __assign(t) {\r\n        for (var s, i = 1, n = arguments.length; i < n; i++) {\r\n            s = arguments[i];\r\n            for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];\r\n        }\r\n        return t;\r\n    }\r\n    return __assign.apply(this, arguments);\r\n}\r\n\r\nexport function __rest(s, e) {\r\n    var t = {};\r\n    for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)\r\n        t[p] = s[p];\r\n    if (s != null && typeof Object.getOwnPropertySymbols === \"function\")\r\n        for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {\r\n            if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))\r\n                t[p[i]] = s[p[i]];\r\n        }\r\n    return t;\r\n}\r\n\r\nexport function __decorate(decorators, target, key, desc) {\r\n    var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;\r\n    if (typeof Reflect === \"object\" && typeof Reflect.decorate === \"function\") r = Reflect.decorate(decorators, target, key, desc);\r\n    else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;\r\n    return c > 3 && r && Object.defineProperty(target, key, r), r;\r\n}\r\n\r\nexport function __param(paramIndex, decorator) {\r\n    return function (target, key) { decorator(target, key, paramIndex); }\r\n}\r\n\r\nexport function __metadata(metadataKey, metadataValue) {\r\n    if (typeof Reflect === \"object\" && typeof Reflect.metadata === \"function\") return Reflect.metadata(metadataKey, metadataValue);\r\n}\r\n\r\nexport function __awaiter(thisArg, _arguments, P, generator) {\r\n    function adopt(value) { return value instanceof P ? 
value : new P(function (resolve) { resolve(value); }); }\r\n    return new (P || (P = Promise))(function (resolve, reject) {\r\n        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }\r\n        function rejected(value) { try { step(generator[\"throw\"](value)); } catch (e) { reject(e); } }\r\n        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }\r\n        step((generator = generator.apply(thisArg, _arguments || [])).next());\r\n    });\r\n}\r\n\r\nexport function __generator(thisArg, body) {\r\n    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;\r\n    return g = { next: verb(0), \"throw\": verb(1), \"return\": verb(2) }, typeof Symbol === \"function\" && (g[Symbol.iterator] = function() { return this; }), g;\r\n    function verb(n) { return function (v) { return step([n, v]); }; }\r\n    function step(op) {\r\n        if (f) throw new TypeError(\"Generator is already executing.\");\r\n        while (_) try {\r\n            if (f = 1, y && (t = op[0] & 2 ? y[\"return\"] : op[0] ? 
y[\"throw\"] || ((t = y[\"return\"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;\r\n            if (y = 0, t) op = [op[0] & 2, t.value];\r\n            switch (op[0]) {\r\n                case 0: case 1: t = op; break;\r\n                case 4: _.label++; return { value: op[1], done: false };\r\n                case 5: _.label++; y = op[1]; op = [0]; continue;\r\n                case 7: op = _.ops.pop(); _.trys.pop(); continue;\r\n                default:\r\n                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }\r\n                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }\r\n                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }\r\n                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }\r\n                    if (t[2]) _.ops.pop();\r\n                    _.trys.pop(); continue;\r\n            }\r\n            op = body.call(thisArg, _);\r\n        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }\r\n        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };\r\n    }\r\n}\r\n\r\nexport var __createBinding = Object.create ? 
(function(o, m, k, k2) {\r\n    if (k2 === undefined) k2 = k;\r\n    Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });\r\n}) : (function(o, m, k, k2) {\r\n    if (k2 === undefined) k2 = k;\r\n    o[k2] = m[k];\r\n});\r\n\r\nexport function __exportStar(m, o) {\r\n    for (var p in m) if (p !== \"default\" && !Object.prototype.hasOwnProperty.call(o, p)) __createBinding(o, m, p);\r\n}\r\n\r\nexport function __values(o) {\r\n    var s = typeof Symbol === \"function\" && Symbol.iterator, m = s && o[s], i = 0;\r\n    if (m) return m.call(o);\r\n    if (o && typeof o.length === \"number\") return {\r\n        next: function () {\r\n            if (o && i >= o.length) o = void 0;\r\n            return { value: o && o[i++], done: !o };\r\n        }\r\n    };\r\n    throw new TypeError(s ? \"Object is not iterable.\" : \"Symbol.iterator is not defined.\");\r\n}\r\n\r\nexport function __read(o, n) {\r\n    var m = typeof Symbol === \"function\" && o[Symbol.iterator];\r\n    if (!m) return o;\r\n    var i = m.call(o), r, ar = [], e;\r\n    try {\r\n        while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value);\r\n    }\r\n    catch (error) { e = { error: error }; }\r\n    finally {\r\n        try {\r\n            if (r && !r.done && (m = i[\"return\"])) m.call(i);\r\n        }\r\n        finally { if (e) throw e.error; }\r\n    }\r\n    return ar;\r\n}\r\n\r\n/** @deprecated */\r\nexport function __spread() {\r\n    for (var ar = [], i = 0; i < arguments.length; i++)\r\n        ar = ar.concat(__read(arguments[i]));\r\n    return ar;\r\n}\r\n\r\n/** @deprecated */\r\nexport function __spreadArrays() {\r\n    for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length;\r\n    for (var r = Array(s), k = 0, i = 0; i < il; i++)\r\n        for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++)\r\n            r[k] = a[j];\r\n    return r;\r\n}\r\n\r\nexport function __spreadArray(to, from, 
pack) {\r\n    if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {\r\n        if (ar || !(i in from)) {\r\n            if (!ar) ar = Array.prototype.slice.call(from, 0, i);\r\n            ar[i] = from[i];\r\n        }\r\n    }\r\n    return to.concat(ar || Array.prototype.slice.call(from));\r\n}\r\n\r\nexport function __await(v) {\r\n    return this instanceof __await ? (this.v = v, this) : new __await(v);\r\n}\r\n\r\nexport function __asyncGenerator(thisArg, _arguments, generator) {\r\n    if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\r\n    var g = generator.apply(thisArg, _arguments || []), i, q = [];\r\n    return i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i;\r\n    function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }\r\n    function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }\r\n    function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }\r\n    function fulfill(value) { resume(\"next\", value); }\r\n    function reject(value) { resume(\"throw\", value); }\r\n    function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }\r\n}\r\n\r\nexport function __asyncDelegator(o) {\r\n    var i, p;\r\n    return i = {}, verb(\"next\"), verb(\"throw\", function (e) { throw e; }), verb(\"return\"), i[Symbol.iterator] = function () { return this; }, i;\r\n    function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? { value: __await(o[n](v)), done: n === \"return\" } : f ? f(v) : v; } : f; }\r\n}\r\n\r\nexport function __asyncValues(o) {\r\n    if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\r\n    var m = o[Symbol.asyncIterator], i;\r\n    return m ? 
m.call(o) : (o = typeof __values === \"function\" ? __values(o) : o[Symbol.iterator](), i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i);\r\n    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }\r\n    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }\r\n}\r\n\r\nexport function __makeTemplateObject(cooked, raw) {\r\n    if (Object.defineProperty) { Object.defineProperty(cooked, \"raw\", { value: raw }); } else { cooked.raw = raw; }\r\n    return cooked;\r\n};\r\n\r\nvar __setModuleDefault = Object.create ? (function(o, v) {\r\n    Object.defineProperty(o, \"default\", { enumerable: true, value: v });\r\n}) : function(o, v) {\r\n    o[\"default\"] = v;\r\n};\r\n\r\nexport function __importStar(mod) {\r\n    if (mod && mod.__esModule) return mod;\r\n    var result = {};\r\n    if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\r\n    __setModuleDefault(result, mod);\r\n    return result;\r\n}\r\n\r\nexport function __importDefault(mod) {\r\n    return (mod && mod.__esModule) ? mod : { default: mod };\r\n}\r\n\r\nexport function __classPrivateFieldGet(receiver, state, kind, f) {\r\n    if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a getter\");\r\n    if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot read private member from an object whose class did not declare it\");\r\n    return kind === \"m\" ? f : kind === \"a\" ? f.call(receiver) : f ? 
f.value : state.get(receiver);\r\n}\r\n\r\nexport function __classPrivateFieldSet(receiver, state, value, kind, f) {\r\n    if (kind === \"m\") throw new TypeError(\"Private method is not writable\");\r\n    if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a setter\");\r\n    if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot write private member to an object whose class did not declare it\");\r\n    return (kind === \"a\" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;\r\n}\r\n", "/**\n * Returns true if the object is a function.\n * @param value The value to check\n */\nexport function isFunction(value: any): value is (...args: any[]) => any {\n  return typeof value === 'function';\n}\n", "/**\n * Used to create Error subclasses until the community moves away from ES5.\n *\n * This is because compiling from TypeScript down to ES5 has issues with subclassing Errors\n * as well as other built-in types: https://github.com/Microsoft/TypeScript/issues/12123\n *\n * @param createImpl A factory function to create the actual constructor implementation. The returned\n * function should be a named function that calls `_super` internally.\n */\nexport function createErrorClass<T>(createImpl: (_super: any) => any): T {\n  const _super = (instance: any) => {\n    Error.call(instance);\n    instance.stack = new Error().stack;\n  };\n\n  const ctorFunc = createImpl(_super);\n  ctorFunc.prototype = Object.create(Error.prototype);\n  ctorFunc.prototype.constructor = ctorFunc;\n  return ctorFunc;\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface UnsubscriptionError extends Error {\n  readonly errors: any[];\n}\n\nexport interface UnsubscriptionErrorCtor {\n  /**\n   * @deprecated Internal implementation detail. 
Do not construct error instances.\n   * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n   */\n  new (errors: any[]): UnsubscriptionError;\n}\n\n/**\n * An error thrown when one or more errors have occurred during the\n * `unsubscribe` of a {@link Subscription}.\n */\nexport const UnsubscriptionError: UnsubscriptionErrorCtor = createErrorClass(\n  (_super) =>\n    function UnsubscriptionErrorImpl(this: any, errors: (Error | string)[]) {\n      _super(this);\n      this.message = errors\n        ? `${errors.length} errors occurred during unsubscription:\n${errors.map((err, i) => `${i + 1}) ${err.toString()}`).join('\\n  ')}`\n        : '';\n      this.name = 'UnsubscriptionError';\n      this.errors = errors;\n    }\n);\n", "/**\n * Removes an item from an array, mutating it.\n * @param arr The array to remove the item from\n * @param item The item to remove\n */\nexport function arrRemove<T>(arr: T[] | undefined | null, item: T) {\n  if (arr) {\n    const index = arr.indexOf(item);\n    0 <= index && arr.splice(index, 1);\n  }\n}\n", "import { isFunction } from './util/isFunction';\nimport { UnsubscriptionError } from './util/UnsubscriptionError';\nimport { SubscriptionLike, TeardownLogic, Unsubscribable } from './types';\nimport { arrRemove } from './util/arrRemove';\n\n/**\n * Represents a disposable resource, such as the execution of an Observable. 
A\n * Subscription has one important method, `unsubscribe`, that takes no argument\n * and just disposes the resource held by the subscription.\n *\n * Additionally, subscriptions may be grouped together through the `add()`\n * method, which will attach a child Subscription to the current Subscription.\n * When a Subscription is unsubscribed, all its children (and its grandchildren)\n * will be unsubscribed as well.\n *\n * @class Subscription\n */\nexport class Subscription implements SubscriptionLike {\n  /** @nocollapse */\n  public static EMPTY = (() => {\n    const empty = new Subscription();\n    empty.closed = true;\n    return empty;\n  })();\n\n  /**\n   * A flag to indicate whether this Subscription has already been unsubscribed.\n   */\n  public closed = false;\n\n  private _parentage: Subscription[] | Subscription | null = null;\n\n  /**\n   * The list of registered finalizers to execute upon unsubscription. Adding and removing from this\n   * list occurs in the {@link #add} and {@link #remove} methods.\n   */\n  private _finalizers: Exclude<TeardownLogic, void>[] | null = null;\n\n  /**\n   * @param initialTeardown A function executed first as part of the finalization\n   * process that is kicked off when {@link #unsubscribe} is called.\n   */\n  constructor(private initialTeardown?: () => void) {}\n\n  /**\n   * Disposes the resources held by the subscription. 
May, for instance, cancel\n   * an ongoing Observable execution or cancel any other type of work that\n   * started when the Subscription was created.\n   * @return {void}\n   */\n  unsubscribe(): void {\n    let errors: any[] | undefined;\n\n    if (!this.closed) {\n      this.closed = true;\n\n      // Remove this from it's parents.\n      const { _parentage } = this;\n      if (_parentage) {\n        this._parentage = null;\n        if (Array.isArray(_parentage)) {\n          for (const parent of _parentage) {\n            parent.remove(this);\n          }\n        } else {\n          _parentage.remove(this);\n        }\n      }\n\n      const { initialTeardown: initialFinalizer } = this;\n      if (isFunction(initialFinalizer)) {\n        try {\n          initialFinalizer();\n        } catch (e) {\n          errors = e instanceof UnsubscriptionError ? e.errors : [e];\n        }\n      }\n\n      const { _finalizers } = this;\n      if (_finalizers) {\n        this._finalizers = null;\n        for (const finalizer of _finalizers) {\n          try {\n            execFinalizer(finalizer);\n          } catch (err) {\n            errors = errors ?? [];\n            if (err instanceof UnsubscriptionError) {\n              errors = [...errors, ...err.errors];\n            } else {\n              errors.push(err);\n            }\n          }\n        }\n      }\n\n      if (errors) {\n        throw new UnsubscriptionError(errors);\n      }\n    }\n  }\n\n  /**\n   * Adds a finalizer to this subscription, so that finalization will be unsubscribed/called\n   * when this subscription is unsubscribed. If this subscription is already {@link #closed},\n   * because it has already been unsubscribed, then whatever finalizer is passed to it\n   * will automatically be executed (unless the finalizer itself is also a closed subscription).\n   *\n   * Closed Subscriptions cannot be added as finalizers to any subscription. 
Adding a closed\n   * subscription to a any subscription will result in no operation. (A noop).\n   *\n   * Adding a subscription to itself, or adding `null` or `undefined` will not perform any\n   * operation at all. (A noop).\n   *\n   * `Subscription` instances that are added to this instance will automatically remove themselves\n   * if they are unsubscribed. Functions and {@link Unsubscribable} objects that you wish to remove\n   * will need to be removed manually with {@link #remove}\n   *\n   * @param teardown The finalization logic to add to this subscription.\n   */\n  add(teardown: TeardownLogic): void {\n    // Only add the finalizer if it's not undefined\n    // and don't add a subscription to itself.\n    if (teardown && teardown !== this) {\n      if (this.closed) {\n        // If this subscription is already closed,\n        // execute whatever finalizer is handed to it automatically.\n        execFinalizer(teardown);\n      } else {\n        if (teardown instanceof Subscription) {\n          // We don't add closed subscriptions, and we don't add the same subscription\n          // twice. Subscription unsubscribe is idempotent.\n          if (teardown.closed || teardown._hasParent(this)) {\n            return;\n          }\n          teardown._addParent(this);\n        }\n        (this._finalizers = this._finalizers ?? 
[]).push(teardown);\n      }\n    }\n  }\n\n  /**\n   * Checks to see if a this subscription already has a particular parent.\n   * This will signal that this subscription has already been added to the parent in question.\n   * @param parent the parent to check for\n   */\n  private _hasParent(parent: Subscription) {\n    const { _parentage } = this;\n    return _parentage === parent || (Array.isArray(_parentage) && _parentage.includes(parent));\n  }\n\n  /**\n   * Adds a parent to this subscription so it can be removed from the parent if it\n   * unsubscribes on it's own.\n   *\n   * NOTE: THIS ASSUMES THAT {@link _hasParent} HAS ALREADY BEEN CHECKED.\n   * @param parent The parent subscription to add\n   */\n  private _addParent(parent: Subscription) {\n    const { _parentage } = this;\n    this._parentage = Array.isArray(_parentage) ? (_parentage.push(parent), _parentage) : _parentage ? [_parentage, parent] : parent;\n  }\n\n  /**\n   * Called on a child when it is removed via {@link #remove}.\n   * @param parent The parent to remove\n   */\n  private _removeParent(parent: Subscription) {\n    const { _parentage } = this;\n    if (_parentage === parent) {\n      this._parentage = null;\n    } else if (Array.isArray(_parentage)) {\n      arrRemove(_parentage, parent);\n    }\n  }\n\n  /**\n   * Removes a finalizer from this subscription that was previously added with the {@link #add} method.\n   *\n   * Note that `Subscription` instances, when unsubscribed, will automatically remove themselves\n   * from every other `Subscription` they have been added to. 
This means that using the `remove` method\n   * is not a common thing and should be used thoughtfully.\n   *\n   * If you add the same finalizer instance of a function or an unsubscribable object to a `Subscription` instance\n   * more than once, you will need to call `remove` the same number of times to remove all instances.\n   *\n   * All finalizer instances are removed to free up memory upon unsubscription.\n   *\n   * @param teardown The finalizer to remove from this subscription\n   */\n  remove(teardown: Exclude<TeardownLogic, void>): void {\n    const { _finalizers } = this;\n    _finalizers && arrRemove(_finalizers, teardown);\n\n    if (teardown instanceof Subscription) {\n      teardown._removeParent(this);\n    }\n  }\n}\n\nexport const EMPTY_SUBSCRIPTION = Subscription.EMPTY;\n\nexport function isSubscription(value: any): value is Subscription {\n  return (\n    value instanceof Subscription ||\n    (value && 'closed' in value && isFunction(value.remove) && isFunction(value.add) && isFunction(value.unsubscribe))\n  );\n}\n\nfunction execFinalizer(finalizer: Unsubscribable | (() => void)) {\n  if (isFunction(finalizer)) {\n    finalizer();\n  } else {\n    finalizer.unsubscribe();\n  }\n}\n", "import { Subscriber } from './Subscriber';\nimport { ObservableNotification } from './types';\n\n/**\n * The {@link GlobalConfig} object for RxJS. It is used to configure things\n * like how to react on unhandled errors.\n */\nexport const config: GlobalConfig = {\n  onUnhandledError: null,\n  onStoppedNotification: null,\n  Promise: undefined,\n  useDeprecatedSynchronousErrorHandling: false,\n  useDeprecatedNextContext: false,\n};\n\n/**\n * The global configuration object for RxJS, used to configure things\n * like how to react on unhandled errors. Accessible via {@link config}\n * object.\n */\nexport interface GlobalConfig {\n  /**\n   * A registration point for unhandled errors from RxJS. 
These are errors that\n   * cannot were not handled by consuming code in the usual subscription path. For\n   * example, if you have this configured, and you subscribe to an observable without\n   * providing an error handler, errors from that subscription will end up here. This\n   * will _always_ be called asynchronously on another job in the runtime. This is because\n   * we do not want errors thrown in this user-configured handler to interfere with the\n   * behavior of the library.\n   */\n  onUnhandledError: ((err: any) => void) | null;\n\n  /**\n   * A registration point for notifications that cannot be sent to subscribers because they\n   * have completed, errored or have been explicitly unsubscribed. By default, next, complete\n   * and error notifications sent to stopped subscribers are noops. However, sometimes callers\n   * might want a different behavior. For example, with sources that attempt to report errors\n   * to stopped subscribers, a caller can configure RxJS to throw an unhandled error instead.\n   * This will _always_ be called asynchronously on another job in the runtime. This is because\n   * we do not want errors thrown in this user-configured handler to interfere with the\n   * behavior of the library.\n   */\n  onStoppedNotification: ((notification: ObservableNotification<any>, subscriber: Subscriber<any>) => void) | null;\n\n  /**\n   * The promise constructor used by default for {@link Observable#toPromise toPromise} and {@link Observable#forEach forEach}\n   * methods.\n   *\n   * @deprecated As of version 8, RxJS will no longer support this sort of injection of a\n   * Promise constructor. If you need a Promise implementation other than native promises,\n   * please polyfill/patch Promise as you see appropriate. Will be removed in v8.\n   */\n  Promise?: PromiseConstructorLike;\n\n  /**\n   * If true, turns on synchronous error rethrowing, which is a deprecated behavior\n   * in v6 and higher. 
This behavior enables bad patterns like wrapping a subscribe\n   * call in a try/catch block. It also enables producer interference, a nasty bug\n   * where a multicast can be broken for all observers by a downstream consumer with\n   * an unhandled error. DO NOT USE THIS FLAG UNLESS IT'S NEEDED TO BUY TIME\n   * FOR MIGRATION REASONS.\n   *\n   * @deprecated As of version 8, RxJS will no longer support synchronous throwing\n   * of unhandled errors. All errors will be thrown on a separate call stack to prevent bad\n   * behaviors described above. Will be removed in v8.\n   */\n  useDeprecatedSynchronousErrorHandling: boolean;\n\n  /**\n   * If true, enables an as-of-yet undocumented feature from v5: The ability to access\n   * `unsubscribe()` via `this` context in `next` functions created in observers passed\n   * to `subscribe`.\n   *\n   * This is being removed because the performance was severely problematic, and it could also cause\n   * issues when types other than POJOs are passed to subscribe as subscribers, as they will likely have\n   * their `this` context overwritten.\n   *\n   * @deprecated As of version 8, RxJS will no longer support altering the\n   * context of next functions provided as part of an observer to Subscribe. Instead,\n   * you will have access to a subscription or a signal or token that will allow you to do things like\n   * unsubscribe and test closed status. 
Will be removed in v8.\n   */\n  useDeprecatedNextContext: boolean;\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetTimeoutFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearTimeoutFunction = (handle: TimerHandle) => void;\n\ninterface TimeoutProvider {\n  setTimeout: SetTimeoutFunction;\n  clearTimeout: ClearTimeoutFunction;\n  delegate:\n    | {\n        setTimeout: SetTimeoutFunction;\n        clearTimeout: ClearTimeoutFunction;\n      }\n    | undefined;\n}\n\nexport const timeoutProvider: TimeoutProvider = {\n  // When accessing the delegate, use the variable rather than `this` so that\n  // the functions can be called without being bound to the provider.\n  setTimeout(handler: () => void, timeout?: number, ...args) {\n    const { delegate } = timeoutProvider;\n    if (delegate?.setTimeout) {\n      return delegate.setTimeout(handler, timeout, ...args);\n    }\n    return setTimeout(handler, timeout, ...args);\n  },\n  clearTimeout(handle) {\n    const { delegate } = timeoutProvider;\n    return (delegate?.clearTimeout || clearTimeout)(handle as any);\n  },\n  delegate: undefined,\n};\n", "import { config } from '../config';\nimport { timeoutProvider } from '../scheduler/timeoutProvider';\n\n/**\n * Handles an error on another job either with the user-configured {@link onUnhandledError},\n * or by throwing it on that new job so it can be picked up by `window.onerror`, `process.on('error')`, etc.\n *\n * This should be called whenever there is an error that is out-of-band with the subscription\n * or when an error hits a terminal boundary of the subscription and no error handler was provided.\n *\n * @param err the error to report\n */\nexport function reportUnhandledError(err: any) {\n  timeoutProvider.setTimeout(() => {\n    const { onUnhandledError } = config;\n    if (onUnhandledError) {\n      // Execute the user-configured error handler.\n      onUnhandledError(err);\n    } else {\n      // 
Throw so it is picked up by the runtime's uncaught error mechanism.\n      throw err;\n    }\n  });\n}\n", "/* tslint:disable:no-empty */\nexport function noop() { }\n", "import { CompleteNotification, NextNotification, ErrorNotification } from './types';\n\n/**\n * A completion object optimized for memory use and created to be the\n * same \"shape\" as other notifications in v8.\n * @internal\n */\nexport const COMPLETE_NOTIFICATION = (() => createNotification('C', undefined, undefined) as CompleteNotification)();\n\n/**\n * Internal use only. Creates an optimized error notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function errorNotification(error: any): ErrorNotification {\n  return createNotification('E', undefined, error) as any;\n}\n\n/**\n * Internal use only. Creates an optimized next notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function nextNotification<T>(value: T) {\n  return createNotification('N', value, undefined) as NextNotification<T>;\n}\n\n/**\n * Ensures that all notifications created internally have the same \"shape\" in v8.\n *\n * TODO: This is only exported to support a crazy legacy test in `groupBy`.\n * @internal\n */\nexport function createNotification(kind: 'N' | 'E' | 'C', value: any, error: any) {\n  return {\n    kind,\n    value,\n    error,\n  };\n}\n", "import { config } from '../config';\n\nlet context: { errorThrown: boolean; error: any } | null = null;\n\n/**\n * Handles dealing with errors for super-gross mode. Creates a context, in which\n * any synchronously thrown errors will be passed to {@link captureError}. 
Which\n * will record the error such that it will be rethrown after the call back is complete.\n * TODO: Remove in v8\n * @param cb An immediately executed function.\n */\nexport function errorContext(cb: () => void) {\n  if (config.useDeprecatedSynchronousErrorHandling) {\n    const isRoot = !context;\n    if (isRoot) {\n      context = { errorThrown: false, error: null };\n    }\n    cb();\n    if (isRoot) {\n      const { errorThrown, error } = context!;\n      context = null;\n      if (errorThrown) {\n        throw error;\n      }\n    }\n  } else {\n    // This is the general non-deprecated path for everyone that\n    // isn't crazy enough to use super-gross mode (useDeprecatedSynchronousErrorHandling)\n    cb();\n  }\n}\n\n/**\n * Captures errors only in super-gross mode.\n * @param err the error to capture\n */\nexport function captureError(err: any) {\n  if (config.useDeprecatedSynchronousErrorHandling && context) {\n    context.errorThrown = true;\n    context.error = err;\n  }\n}\n", "import { isFunction } from './util/isFunction';\nimport { Observer, ObservableNotification } from './types';\nimport { isSubscription, Subscription } from './Subscription';\nimport { config } from './config';\nimport { reportUnhandledError } from './util/reportUnhandledError';\nimport { noop } from './util/noop';\nimport { nextNotification, errorNotification, COMPLETE_NOTIFICATION } from './NotificationFactories';\nimport { timeoutProvider } from './scheduler/timeoutProvider';\nimport { captureError } from './util/errorContext';\n\n/**\n * Implements the {@link Observer} interface and extends the\n * {@link Subscription} class. While the {@link Observer} is the public API for\n * consuming the values of an {@link Observable}, all Observers get converted to\n * a Subscriber, in order to provide Subscription-like capabilities such as\n * `unsubscribe`. 
Subscriber is a common type in RxJS, and crucial for\n * implementing operators, but it is rarely used as a public API.\n *\n * @class Subscriber<T>\n */\nexport class Subscriber<T> extends Subscription implements Observer<T> {\n  /**\n   * A static factory for a Subscriber, given a (potentially partial) definition\n   * of an Observer.\n   * @param next The `next` callback of an Observer.\n   * @param error The `error` callback of an\n   * Observer.\n   * @param complete The `complete` callback of an\n   * Observer.\n   * @return A Subscriber wrapping the (partially defined)\n   * Observer represented by the given arguments.\n   * @nocollapse\n   * @deprecated Do not use. Will be removed in v8. There is no replacement for this\n   * method, and there is no reason to be creating instances of `Subscriber` directly.\n   * If you have a specific use case, please file an issue.\n   */\n  static create<T>(next?: (x?: T) => void, error?: (e?: any) => void, complete?: () => void): Subscriber<T> {\n    return new SafeSubscriber(next, error, complete);\n  }\n\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n  protected isStopped: boolean = false;\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n  protected destination: Subscriber<any> | Observer<any>; // this `any` is the escape hatch to erase extra type param (e.g. R)\n\n  /**\n   * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n   * There is no reason to directly create an instance of Subscriber. 
This type is exported for typings reasons.\n   */\n  constructor(destination?: Subscriber<any> | Observer<any>) {\n    super();\n    if (destination) {\n      this.destination = destination;\n      // Automatically chain subscriptions together here.\n      // if destination is a Subscription, then it is a Subscriber.\n      if (isSubscription(destination)) {\n        destination.add(this);\n      }\n    } else {\n      this.destination = EMPTY_OBSERVER;\n    }\n  }\n\n  /**\n   * The {@link Observer} callback to receive notifications of type `next` from\n   * the Observable, with a value. The Observable may call this method 0 or more\n   * times.\n   * @param {T} [value] The `next` value.\n   * @return {void}\n   */\n  next(value?: T): void {\n    if (this.isStopped) {\n      handleStoppedNotification(nextNotification(value), this);\n    } else {\n      this._next(value!);\n    }\n  }\n\n  /**\n   * The {@link Observer} callback to receive notifications of type `error` from\n   * the Observable, with an attached `Error`. Notifies the Observer that\n   * the Observable has experienced an error condition.\n   * @param {any} [err] The `error` exception.\n   * @return {void}\n   */\n  error(err?: any): void {\n    if (this.isStopped) {\n      handleStoppedNotification(errorNotification(err), this);\n    } else {\n      this.isStopped = true;\n      this._error(err);\n    }\n  }\n\n  /**\n   * The {@link Observer} callback to receive a valueless notification of type\n   * `complete` from the Observable. 
Notifies the Observer that the Observable\n   * has finished sending push-based notifications.\n   * @return {void}\n   */\n  complete(): void {\n    if (this.isStopped) {\n      handleStoppedNotification(COMPLETE_NOTIFICATION, this);\n    } else {\n      this.isStopped = true;\n      this._complete();\n    }\n  }\n\n  unsubscribe(): void {\n    if (!this.closed) {\n      this.isStopped = true;\n      super.unsubscribe();\n      this.destination = null!;\n    }\n  }\n\n  protected _next(value: T): void {\n    this.destination.next(value);\n  }\n\n  protected _error(err: any): void {\n    try {\n      this.destination.error(err);\n    } finally {\n      this.unsubscribe();\n    }\n  }\n\n  protected _complete(): void {\n    try {\n      this.destination.complete();\n    } finally {\n      this.unsubscribe();\n    }\n  }\n}\n\n/**\n * This bind is captured here because we want to be able to have\n * compatibility with monoid libraries that tend to use a method named\n * `bind`. In particular, a library called Monio requires this.\n */\nconst _bind = Function.prototype.bind;\n\nfunction bind<Fn extends (...args: any[]) => any>(fn: Fn, thisArg: any): Fn {\n  return _bind.call(fn, thisArg);\n}\n\n/**\n * Internal optimization only, DO NOT EXPOSE.\n * @internal\n */\nclass ConsumerObserver<T> implements Observer<T> {\n  constructor(private partialObserver: Partial<Observer<T>>) {}\n\n  next(value: T): void {\n    const { partialObserver } = this;\n    if (partialObserver.next) {\n      try {\n        partialObserver.next(value);\n      } catch (error) {\n        handleUnhandledError(error);\n      }\n    }\n  }\n\n  error(err: any): void {\n    const { partialObserver } = this;\n    if (partialObserver.error) {\n      try {\n        partialObserver.error(err);\n      } catch (error) {\n        handleUnhandledError(error);\n      }\n    } else {\n      handleUnhandledError(err);\n    }\n  }\n\n  complete(): void {\n    const { partialObserver } = this;\n    if 
(partialObserver.complete) {\n      try {\n        partialObserver.complete();\n      } catch (error) {\n        handleUnhandledError(error);\n      }\n    }\n  }\n}\n\nexport class SafeSubscriber<T> extends Subscriber<T> {\n  constructor(\n    observerOrNext?: Partial<Observer<T>> | ((value: T) => void) | null,\n    error?: ((e?: any) => void) | null,\n    complete?: (() => void) | null\n  ) {\n    super();\n\n    let partialObserver: Partial<Observer<T>>;\n    if (isFunction(observerOrNext) || !observerOrNext) {\n      // The first argument is a function, not an observer. The next\n      // two arguments *could* be observers, or they could be empty.\n      partialObserver = {\n        next: (observerOrNext ?? undefined) as (((value: T) => void) | undefined),\n        error: error ?? undefined,\n        complete: complete ?? undefined,\n      };\n    } else {\n      // The first argument is a partial observer.\n      let context: any;\n      if (this && config.useDeprecatedNextContext) {\n        // This is a deprecated path that made `this.unsubscribe()` available in\n        // next handler functions passed to subscribe. This only exists behind a flag\n        // now, as it is *very* slow.\n        context = Object.create(observerOrNext);\n        context.unsubscribe = () => this.unsubscribe();\n        partialObserver = {\n          next: observerOrNext.next && bind(observerOrNext.next, context),\n          error: observerOrNext.error && bind(observerOrNext.error, context),\n          complete: observerOrNext.complete && bind(observerOrNext.complete, context),\n        };\n      } else {\n        // The \"normal\" path. 
Just use the partial observer directly.\n        partialObserver = observerOrNext;\n      }\n    }\n\n    // Wrap the partial observer to ensure it's a full observer, and\n    // make sure proper error handling is accounted for.\n    this.destination = new ConsumerObserver(partialObserver);\n  }\n}\n\nfunction handleUnhandledError(error: any) {\n  if (config.useDeprecatedSynchronousErrorHandling) {\n    captureError(error);\n  } else {\n    // Ideal path, we report this as an unhandled error,\n    // which is thrown on a new call stack.\n    reportUnhandledError(error);\n  }\n}\n\n/**\n * An error handler used when no error handler was supplied\n * to the SafeSubscriber -- meaning no error handler was supplied\n * do the `subscribe` call on our observable.\n * @param err The error to handle\n */\nfunction defaultErrorHandler(err: any) {\n  throw err;\n}\n\n/**\n * A handler for notifications that cannot be sent to a stopped subscriber.\n * @param notification The notification being sent\n * @param subscriber The stopped subscriber\n */\nfunction handleStoppedNotification(notification: ObservableNotification<any>, subscriber: Subscriber<any>) {\n  const { onStoppedNotification } = config;\n  onStoppedNotification && timeoutProvider.setTimeout(() => onStoppedNotification(notification, subscriber));\n}\n\n/**\n * The observer used as a stub for subscriptions where the user did not\n * pass any arguments to `subscribe`. Comes with the default error handling\n * behavior.\n */\nexport const EMPTY_OBSERVER: Readonly<Observer<any>> & { closed: true } = {\n  closed: true,\n  next: noop,\n  error: defaultErrorHandler,\n  complete: noop,\n};\n", "/**\n * Symbol.observable or a string \"@@observable\". 
Used for interop\n *\n * @deprecated We will no longer be exporting this symbol in upcoming versions of RxJS.\n * Instead polyfill and use Symbol.observable directly *or* use https://www.npmjs.com/package/symbol-observable\n */\nexport const observable: string | symbol = (() => (typeof Symbol === 'function' && Symbol.observable) || '@@observable')();\n", "/**\n * This function takes one parameter and just returns it. Simply put,\n * this is like `<T>(x: T): T => x`.\n *\n * ## Examples\n *\n * This is useful in some cases when using things like `mergeMap`\n *\n * ```ts\n * import { interval, take, map, range, mergeMap, identity } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(5));\n *\n * const result$ = source$.pipe(\n *   map(i => range(i)),\n *   mergeMap(identity) // same as mergeMap(x => x)\n * );\n *\n * result$.subscribe({\n *   next: console.log\n * });\n * ```\n *\n * Or when you want to selectively apply an operator\n *\n * ```ts\n * import { interval, take, identity } from 'rxjs';\n *\n * const shouldLimit = () => Math.random() < 0.5;\n *\n * const source$ = interval(1000);\n *\n * const result$ = source$.pipe(shouldLimit() ? 
take(5) : identity);\n *\n * result$.subscribe({\n *   next: console.log\n * });\n * ```\n *\n * @param x Any value that is returned by this function\n * @returns The value passed as the first parameter to this function\n */\nexport function identity<T>(x: T): T {\n  return x;\n}\n", "import { identity } from './identity';\nimport { UnaryFunction } from '../types';\n\nexport function pipe(): typeof identity;\nexport function pipe<T, A>(fn1: UnaryFunction<T, A>): UnaryFunction<T, A>;\nexport function pipe<T, A, B>(fn1: UnaryFunction<T, A>, fn2: UnaryFunction<A, B>): UnaryFunction<T, B>;\nexport function pipe<T, A, B, C>(fn1: UnaryFunction<T, A>, fn2: UnaryFunction<A, B>, fn3: UnaryFunction<B, C>): UnaryFunction<T, C>;\nexport function pipe<T, A, B, C, D>(\n  fn1: UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>\n): UnaryFunction<T, D>;\nexport function pipe<T, A, B, C, D, E>(\n  fn1: UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>,\n  fn5: UnaryFunction<D, E>\n): UnaryFunction<T, E>;\nexport function pipe<T, A, B, C, D, E, F>(\n  fn1: UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>,\n  fn5: UnaryFunction<D, E>,\n  fn6: UnaryFunction<E, F>\n): UnaryFunction<T, F>;\nexport function pipe<T, A, B, C, D, E, F, G>(\n  fn1: UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>,\n  fn5: UnaryFunction<D, E>,\n  fn6: UnaryFunction<E, F>,\n  fn7: UnaryFunction<F, G>\n): UnaryFunction<T, G>;\nexport function pipe<T, A, B, C, D, E, F, G, H>(\n  fn1: UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>,\n  fn5: UnaryFunction<D, E>,\n  fn6: UnaryFunction<E, F>,\n  fn7: UnaryFunction<F, G>,\n  fn8: UnaryFunction<G, H>\n): UnaryFunction<T, H>;\nexport function pipe<T, A, B, C, D, E, F, G, H, I>(\n  fn1: 
UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>,\n  fn5: UnaryFunction<D, E>,\n  fn6: UnaryFunction<E, F>,\n  fn7: UnaryFunction<F, G>,\n  fn8: UnaryFunction<G, H>,\n  fn9: UnaryFunction<H, I>\n): UnaryFunction<T, I>;\nexport function pipe<T, A, B, C, D, E, F, G, H, I>(\n  fn1: UnaryFunction<T, A>,\n  fn2: UnaryFunction<A, B>,\n  fn3: UnaryFunction<B, C>,\n  fn4: UnaryFunction<C, D>,\n  fn5: UnaryFunction<D, E>,\n  fn6: UnaryFunction<E, F>,\n  fn7: UnaryFunction<F, G>,\n  fn8: UnaryFunction<G, H>,\n  fn9: UnaryFunction<H, I>,\n  ...fns: UnaryFunction<any, any>[]\n): UnaryFunction<T, unknown>;\n\n/**\n * pipe() can be called on one or more functions, each of which can take one argument (\"UnaryFunction\")\n * and uses it to return a value.\n * It returns a function that takes one argument, passes it to the first UnaryFunction, and then\n * passes the result to the next one, passes that result to the next one, and so on.  \n */\nexport function pipe(...fns: Array<UnaryFunction<any, any>>): UnaryFunction<any, any> {\n  return pipeFromArray(fns);\n}\n\n/** @internal */\nexport function pipeFromArray<T, R>(fns: Array<UnaryFunction<T, R>>): UnaryFunction<T, R> {\n  if (fns.length === 0) {\n    return identity as UnaryFunction<any, any>;\n  }\n\n  if (fns.length === 1) {\n    return fns[0];\n  }\n\n  return function piped(input: T): R {\n    return fns.reduce((prev: any, fn: UnaryFunction<T, R>) => fn(prev), input as any);\n  };\n}\n", "import { Operator } from './Operator';\nimport { SafeSubscriber, Subscriber } from './Subscriber';\nimport { isSubscription, Subscription } from './Subscription';\nimport { TeardownLogic, OperatorFunction, Subscribable, Observer } from './types';\nimport { observable as Symbol_observable } from './symbol/observable';\nimport { pipeFromArray } from './util/pipe';\nimport { config } from './config';\nimport { isFunction } from './util/isFunction';\nimport { errorContext } from 
'./util/errorContext';\n\n/**\n * A representation of any set of values over any amount of time. This is the most basic building block\n * of RxJS.\n *\n * @class Observable<T>\n */\nexport class Observable<T> implements Subscribable<T> {\n  /**\n   * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n   */\n  source: Observable<any> | undefined;\n\n  /**\n   * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n   */\n  operator: Operator<any, T> | undefined;\n\n  /**\n   * @constructor\n   * @param {Function} subscribe the function that is called when the Observable is\n   * initially subscribed to. This function is given a Subscriber, to which new values\n   * can be `next`ed, or an `error` method can be called to raise an error, or\n   * `complete` can be called to notify of a successful completion.\n   */\n  constructor(subscribe?: (this: Observable<T>, subscriber: Subscriber<T>) => TeardownLogic) {\n    if (subscribe) {\n      this._subscribe = subscribe;\n    }\n  }\n\n  // HACK: Since TypeScript inherits static properties too, we have to\n  // fight against TypeScript here so Subject can have a different static create signature\n  /**\n   * Creates a new Observable by calling the Observable constructor\n   * @owner Observable\n   * @method create\n   * @param {Function} subscribe? the subscriber function to be passed to the Observable constructor\n   * @return {Observable} a new observable\n   * @nocollapse\n   * @deprecated Use `new Observable()` instead. 
Will be removed in v8.\n   */\n  static create: (...args: any[]) => any = <T>(subscribe?: (subscriber: Subscriber<T>) => TeardownLogic) => {\n    return new Observable<T>(subscribe);\n  };\n\n  /**\n   * Creates a new Observable, with this Observable instance as the source, and the passed\n   * operator defined as the new observable's operator.\n   * @method lift\n   * @param operator the operator defining the operation to take on the observable\n   * @return a new observable with the Operator applied\n   * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n   * If you have implemented an operator using `lift`, it is recommended that you create an\n   * operator by simply returning `new Observable()` directly. See \"Creating new operators from\n   * scratch\" section here: https://rxjs.dev/guide/operators\n   */\n  lift<R>(operator?: Operator<T, R>): Observable<R> {\n    const observable = new Observable<R>();\n    observable.source = this;\n    observable.operator = operator;\n    return observable;\n  }\n\n  subscribe(observerOrNext?: Partial<Observer<T>> | ((value: T) => void)): Subscription;\n  /** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\n  subscribe(next?: ((value: T) => void) | null, error?: ((error: any) => void) | null, complete?: (() => void) | null): Subscription;\n  /**\n   * Invokes an execution of an Observable and registers Observer handlers for notifications it will emit.\n   *\n   * <span class=\"informal\">Use it when you have all these Observables, but still nothing is happening.</span>\n   *\n   * `subscribe` is not a regular operator, but a method that calls Observable's internal `subscribe` function. 
It\n   * might be for example a function that you passed to Observable's constructor, but most of the time it is\n   * a library implementation, which defines what will be emitted by an Observable, and when it be will emitted. This means\n   * that calling `subscribe` is actually the moment when Observable starts its work, not when it is created, as it is often\n   * the thought.\n   *\n   * Apart from starting the execution of an Observable, this method allows you to listen for values\n   * that an Observable emits, as well as for when it completes or errors. You can achieve this in two\n   * of the following ways.\n   *\n   * The first way is creating an object that implements {@link Observer} interface. It should have methods\n   * defined by that interface, but note that it should be just a regular JavaScript object, which you can create\n   * yourself in any way you want (ES6 class, classic function constructor, object literal etc.). In particular, do\n   * not attempt to use any RxJS implementation details to create Observers - you don't need them. Remember also\n   * that your object does not have to implement all methods. If you find yourself creating a method that doesn't\n   * do anything, you can simply omit it. Note however, if the `error` method is not provided and an error happens,\n   * it will be thrown asynchronously. Errors thrown asynchronously cannot be caught using `try`/`catch`. Instead,\n   * use the {@link onUnhandledError} configuration option or use a runtime handler (like `window.onerror` or\n   * `process.on('error)`) to be notified of unhandled errors. 
Because of this, it's recommended that you provide\n   * an `error` method to avoid missing thrown errors.\n   *\n   * The second way is to give up on Observer object altogether and simply provide callback functions in place of its methods.\n   * This means you can provide three functions as arguments to `subscribe`, where the first function is equivalent\n   * of a `next` method, the second of an `error` method and the third of a `complete` method. Just as in case of an Observer,\n   * if you do not need to listen for something, you can omit a function by passing `undefined` or `null`,\n   * since `subscribe` recognizes these functions by where they were placed in function call. When it comes\n   * to the `error` function, as with an Observer, if not provided, errors emitted by an Observable will be thrown asynchronously.\n   *\n   * You can, however, subscribe with no parameters at all. This may be the case where you're not interested in terminal events\n   * and you also handled emissions internally by using operators (e.g. using `tap`).\n   *\n   * Whichever style of calling `subscribe` you use, in both cases it returns a Subscription object.\n   * This object allows you to call `unsubscribe` on it, which in turn will stop the work that an Observable does and will clean\n   * up all resources that an Observable used. Note that cancelling a subscription will not call `complete` callback\n   * provided to `subscribe` function, which is reserved for a regular completion signal that comes from an Observable.\n   *\n   * Remember that callbacks provided to `subscribe` are not guaranteed to be called asynchronously.\n   * It is an Observable itself that decides when these functions will be called. For example {@link of}\n   * by default emits all its values synchronously. 
Always check documentation for how given Observable\n   * will behave when subscribed and if its default behavior can be modified with a `scheduler`.\n   *\n   * #### Examples\n   *\n   * Subscribe with an {@link guide/observer Observer}\n   *\n   * ```ts\n   * import { of } from 'rxjs';\n   *\n   * const sumObserver = {\n   *   sum: 0,\n   *   next(value) {\n   *     console.log('Adding: ' + value);\n   *     this.sum = this.sum + value;\n   *   },\n   *   error() {\n   *     // We actually could just remove this method,\n   *     // since we do not really care about errors right now.\n   *   },\n   *   complete() {\n   *     console.log('Sum equals: ' + this.sum);\n   *   }\n   * };\n   *\n   * of(1, 2, 3) // Synchronously emits 1, 2, 3 and then completes.\n   *   .subscribe(sumObserver);\n   *\n   * // Logs:\n   * // 'Adding: 1'\n   * // 'Adding: 2'\n   * // 'Adding: 3'\n   * // 'Sum equals: 6'\n   * ```\n   *\n   * Subscribe with functions ({@link deprecations/subscribe-arguments deprecated})\n   *\n   * ```ts\n   * import { of } from 'rxjs'\n   *\n   * let sum = 0;\n   *\n   * of(1, 2, 3).subscribe(\n   *   value => {\n   *     console.log('Adding: ' + value);\n   *     sum = sum + value;\n   *   },\n   *   undefined,\n   *   () => console.log('Sum equals: ' + sum)\n   * );\n   *\n   * // Logs:\n   * // 'Adding: 1'\n   * // 'Adding: 2'\n   * // 'Adding: 3'\n   * // 'Sum equals: 6'\n   * ```\n   *\n   * Cancel a subscription\n   *\n   * ```ts\n   * import { interval } from 'rxjs';\n   *\n   * const subscription = interval(1000).subscribe({\n   *   next(num) {\n   *     console.log(num)\n   *   },\n   *   complete() {\n   *     // Will not be called, even when cancelling subscription.\n   *     console.log('completed!');\n   *   }\n   * });\n   *\n   * setTimeout(() => {\n   *   subscription.unsubscribe();\n   *   console.log('unsubscribed!');\n   * }, 2500);\n   *\n   * // Logs:\n   * // 0 after 1s\n   * // 1 after 2s\n   * // 'unsubscribed!' 
after 2.5s\n   * ```\n   *\n   * @param {Observer|Function} observerOrNext (optional) Either an observer with methods to be called,\n   * or the first of three possible handlers, which is the handler for each value emitted from the subscribed\n   * Observable.\n   * @param {Function} error (optional) A handler for a terminal event resulting from an error. If no error handler is provided,\n   * the error will be thrown asynchronously as unhandled.\n   * @param {Function} complete (optional) A handler for a terminal event resulting from successful completion.\n   * @return {Subscription} a subscription reference to the registered handlers\n   * @method subscribe\n   */\n  subscribe(\n    observerOrNext?: Partial<Observer<T>> | ((value: T) => void) | null,\n    error?: ((error: any) => void) | null,\n    complete?: (() => void) | null\n  ): Subscription {\n    const subscriber = isSubscriber(observerOrNext) ? observerOrNext : new SafeSubscriber(observerOrNext, error, complete);\n\n    errorContext(() => {\n      const { operator, source } = this;\n      subscriber.add(\n        operator\n          ? // We're dealing with a subscription in the\n            // operator chain to one of our lifted operators.\n            operator.call(subscriber, source)\n          : source\n          ? // If `source` has a value, but `operator` does not, something that\n            // had intimate knowledge of our API, like our `Subject`, must have\n            // set it. 
We're going to just call `_subscribe` directly.\n            this._subscribe(subscriber)\n          : // In all other cases, we're likely wrapping a user-provided initializer\n            // function, so we need to catch errors and handle them appropriately.\n            this._trySubscribe(subscriber)\n      );\n    });\n\n    return subscriber;\n  }\n\n  /** @internal */\n  protected _trySubscribe(sink: Subscriber<T>): TeardownLogic {\n    try {\n      return this._subscribe(sink);\n    } catch (err) {\n      // We don't need to return anything in this case,\n      // because it's just going to try to `add()` to a subscription\n      // above.\n      sink.error(err);\n    }\n  }\n\n  /**\n   * Used as a NON-CANCELLABLE means of subscribing to an observable, for use with\n   * APIs that expect promises, like `async/await`. You cannot unsubscribe from this.\n   *\n   * **WARNING**: Only use this with observables you *know* will complete. If the source\n   * observable does not complete, you will end up with a promise that is hung up, and\n   * potentially all of the state of an async function hanging out in memory. 
To avoid\n   * this situation, look into adding something like {@link timeout}, {@link take},\n   * {@link takeWhile}, or {@link takeUntil} amongst others.\n   *\n   * #### Example\n   *\n   * ```ts\n   * import { interval, take } from 'rxjs';\n   *\n   * const source$ = interval(1000).pipe(take(4));\n   *\n   * async function getTotal() {\n   *   let total = 0;\n   *\n   *   await source$.forEach(value => {\n   *     total += value;\n   *     console.log('observable -> ' + value);\n   *   });\n   *\n   *   return total;\n   * }\n   *\n   * getTotal().then(\n   *   total => console.log('Total: ' + total)\n   * );\n   *\n   * // Expected:\n   * // 'observable -> 0'\n   * // 'observable -> 1'\n   * // 'observable -> 2'\n   * // 'observable -> 3'\n   * // 'Total: 6'\n   * ```\n   *\n   * @param next a handler for each value emitted by the observable\n   * @return a promise that either resolves on observable completion or\n   *  rejects with the handled error\n   */\n  forEach(next: (value: T) => void): Promise<void>;\n\n  /**\n   * @param next a handler for each value emitted by the observable\n   * @param promiseCtor a constructor function used to instantiate the Promise\n   * @return a promise that either resolves on observable completion or\n   *  rejects with the handled error\n   * @deprecated Passing a Promise constructor will no longer be available\n   * in upcoming versions of RxJS. This is because it adds weight to the library, for very\n   * little benefit. If you need this functionality, it is recommended that you either\n   * polyfill Promise, or you create an adapter to convert the returned native promise\n   * to whatever promise implementation you wanted. 
Will be removed in v8.\n   */\n  forEach(next: (value: T) => void, promiseCtor: PromiseConstructorLike): Promise<void>;\n\n  forEach(next: (value: T) => void, promiseCtor?: PromiseConstructorLike): Promise<void> {\n    promiseCtor = getPromiseCtor(promiseCtor);\n\n    return new promiseCtor<void>((resolve, reject) => {\n      const subscriber = new SafeSubscriber<T>({\n        next: (value) => {\n          try {\n            next(value);\n          } catch (err) {\n            reject(err);\n            subscriber.unsubscribe();\n          }\n        },\n        error: reject,\n        complete: resolve,\n      });\n      this.subscribe(subscriber);\n    }) as Promise<void>;\n  }\n\n  /** @internal */\n  protected _subscribe(subscriber: Subscriber<any>): TeardownLogic {\n    return this.source?.subscribe(subscriber);\n  }\n\n  /**\n   * An interop point defined by the es7-observable spec https://github.com/zenparsing/es-observable\n   * @method Symbol.observable\n   * @return {Observable} this instance of the observable\n   */\n  [Symbol_observable]() {\n    return this;\n  }\n\n  /* tslint:disable:max-line-length */\n  pipe(): Observable<T>;\n  pipe<A>(op1: OperatorFunction<T, A>): Observable<A>;\n  pipe<A, B>(op1: OperatorFunction<T, A>, op2: OperatorFunction<A, B>): Observable<B>;\n  pipe<A, B, C>(op1: OperatorFunction<T, A>, op2: OperatorFunction<A, B>, op3: OperatorFunction<B, C>): Observable<C>;\n  pipe<A, B, C, D>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>\n  ): Observable<D>;\n  pipe<A, B, C, D, E>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>,\n    op5: OperatorFunction<D, E>\n  ): Observable<E>;\n  pipe<A, B, C, D, E, F>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>,\n    op5: 
OperatorFunction<D, E>,\n    op6: OperatorFunction<E, F>\n  ): Observable<F>;\n  pipe<A, B, C, D, E, F, G>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>,\n    op5: OperatorFunction<D, E>,\n    op6: OperatorFunction<E, F>,\n    op7: OperatorFunction<F, G>\n  ): Observable<G>;\n  pipe<A, B, C, D, E, F, G, H>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>,\n    op5: OperatorFunction<D, E>,\n    op6: OperatorFunction<E, F>,\n    op7: OperatorFunction<F, G>,\n    op8: OperatorFunction<G, H>\n  ): Observable<H>;\n  pipe<A, B, C, D, E, F, G, H, I>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>,\n    op5: OperatorFunction<D, E>,\n    op6: OperatorFunction<E, F>,\n    op7: OperatorFunction<F, G>,\n    op8: OperatorFunction<G, H>,\n    op9: OperatorFunction<H, I>\n  ): Observable<I>;\n  pipe<A, B, C, D, E, F, G, H, I>(\n    op1: OperatorFunction<T, A>,\n    op2: OperatorFunction<A, B>,\n    op3: OperatorFunction<B, C>,\n    op4: OperatorFunction<C, D>,\n    op5: OperatorFunction<D, E>,\n    op6: OperatorFunction<E, F>,\n    op7: OperatorFunction<F, G>,\n    op8: OperatorFunction<G, H>,\n    op9: OperatorFunction<H, I>,\n    ...operations: OperatorFunction<any, any>[]\n  ): Observable<unknown>;\n  /* tslint:enable:max-line-length */\n\n  /**\n   * Used to stitch together functional operators into a chain.\n   * @method pipe\n   * @return {Observable} the Observable result of all of the operators having\n   * been called in the order they were passed in.\n   *\n   * ## Example\n   *\n   * ```ts\n   * import { interval, filter, map, scan } from 'rxjs';\n   *\n   * interval(1000)\n   *   .pipe(\n   *     filter(x => x % 2 === 0),\n   *     map(x => x + x),\n   *     scan((acc, x) => acc + x)\n   *   )\n   *   
.subscribe(x => console.log(x));\n   * ```\n   */\n  pipe(...operations: OperatorFunction<any, any>[]): Observable<any> {\n    return pipeFromArray(operations)(this);\n  }\n\n  /* tslint:disable:max-line-length */\n  /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n  toPromise(): Promise<T | undefined>;\n  /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n  toPromise(PromiseCtor: typeof Promise): Promise<T | undefined>;\n  /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n  toPromise(PromiseCtor: PromiseConstructorLike): Promise<T | undefined>;\n  /* tslint:enable:max-line-length */\n\n  /**\n   * Subscribe to this Observable and get a Promise resolving on\n   * `complete` with the last emission (if any).\n   *\n   * **WARNING**: Only use this with observables you *know* will complete. If the source\n   * observable does not complete, you will end up with a promise that is hung up, and\n   * potentially all of the state of an async function hanging out in memory. To avoid\n   * this situation, look into adding something like {@link timeout}, {@link take},\n   * {@link takeWhile}, or {@link takeUntil} amongst others.\n   *\n   * @method toPromise\n   * @param [promiseCtor] a constructor function used to instantiate\n   * the Promise\n   * @return A Promise that resolves with the last value emit, or\n   * rejects on an error. If there were no emissions, Promise\n   * resolves with undefined.\n   * @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. 
Details: https://rxjs.dev/deprecations/to-promise\n   */\n  toPromise(promiseCtor?: PromiseConstructorLike): Promise<T | undefined> {\n    promiseCtor = getPromiseCtor(promiseCtor);\n\n    return new promiseCtor((resolve, reject) => {\n      let value: T | undefined;\n      this.subscribe(\n        (x: T) => (value = x),\n        (err: any) => reject(err),\n        () => resolve(value)\n      );\n    }) as Promise<T | undefined>;\n  }\n}\n\n/**\n * Decides between a passed promise constructor from consuming code,\n * A default configured promise constructor, and the native promise\n * constructor and returns it. If nothing can be found, it will throw\n * an error.\n * @param promiseCtor The optional promise constructor to passed by consuming code\n */\nfunction getPromiseCtor(promiseCtor: PromiseConstructorLike | undefined) {\n  return promiseCtor ?? config.Promise ?? Promise;\n}\n\nfunction isObserver<T>(value: any): value is Observer<T> {\n  return value && isFunction(value.next) && isFunction(value.error) && isFunction(value.complete);\n}\n\nfunction isSubscriber<T>(value: any): value is Subscriber<T> {\n  return (value && value instanceof Subscriber) || (isObserver(value) && isSubscription(value));\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { OperatorFunction } from '../types';\nimport { isFunction } from './isFunction';\n\n/**\n * Used to determine if an object is an Observable with a lift function.\n */\nexport function hasLift(source: any): source is { lift: InstanceType<typeof Observable>['lift'] } {\n  return isFunction(source?.lift);\n}\n\n/**\n * Creates an `OperatorFunction`. 
Used to define operators throughout the library in a concise way.\n * @param init The logic to connect the liftedSource to the subscriber at the moment of subscription.\n */\nexport function operate<T, R>(\n  init: (liftedSource: Observable<T>, subscriber: Subscriber<R>) => (() => void) | void\n): OperatorFunction<T, R> {\n  return (source: Observable<T>) => {\n    if (hasLift(source)) {\n      return source.lift(function (this: Subscriber<R>, liftedSource: Observable<T>) {\n        try {\n          return init(liftedSource, this);\n        } catch (err) {\n          this.error(err);\n        }\n      });\n    }\n    throw new TypeError('Unable to lift unknown Observable type');\n  };\n}\n", "import { Subscriber } from '../Subscriber';\n\n/**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional teardown logic here. This will only be called on teardown if the\n * subscriber itself is not already closed. 
This is called after all other teardown logic is executed.\n */\nexport function createOperatorSubscriber<T>(\n  destination: Subscriber<any>,\n  onNext?: (value: T) => void,\n  onComplete?: () => void,\n  onError?: (err: any) => void,\n  onFinalize?: () => void\n): Subscriber<T> {\n  return new OperatorSubscriber(destination, onNext, onComplete, onError, onFinalize);\n}\n\n/**\n * A generic helper for allowing operators to be created with a Subscriber and\n * use closures to capture necessary state from the operator function itself.\n */\nexport class OperatorSubscriber<T> extends Subscriber<T> {\n  /**\n   * Creates an instance of an `OperatorSubscriber`.\n   * @param destination The downstream subscriber.\n   * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n   * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n   * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n   * and send to the `destination` error handler.\n   * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n   * this handler are sent to the `destination` error handler.\n   * @param onFinalize Additional finalization logic here. This will only be called on finalization if the\n   * subscriber itself is not already closed. This is called after all other finalization logic is executed.\n   * @param shouldUnsubscribe An optional check to see if an unsubscribe call should truly unsubscribe.\n   * NOTE: This currently **ONLY** exists to support the strange behavior of {@link groupBy}, where unsubscription\n   * to the resulting observable does not actually disconnect from the source if there are active subscriptions\n   * to any grouped observable. 
(DO NOT EXPOSE OR USE EXTERNALLY!!!)\n   */\n  constructor(\n    destination: Subscriber<any>,\n    onNext?: (value: T) => void,\n    onComplete?: () => void,\n    onError?: (err: any) => void,\n    private onFinalize?: () => void,\n    private shouldUnsubscribe?: () => boolean\n  ) {\n    // It's important - for performance reasons - that all of this class's\n    // members are initialized and that they are always initialized in the same\n    // order. This will ensure that all OperatorSubscriber instances have the\n    // same hidden class in V8. This, in turn, will help keep the number of\n    // hidden classes involved in property accesses within the base class as\n    // low as possible. If the number of hidden classes involved exceeds four,\n    // the property accesses will become megamorphic and performance penalties\n    // will be incurred - i.e. inline caches won't be used.\n    //\n    // The reasons for ensuring all instances have the same hidden class are\n    // further discussed in this blog post from Benedikt Meurer:\n    // https://benediktmeurer.de/2018/03/23/impact-of-polymorphism-on-component-based-frameworks-like-react/\n    super(destination);\n    this._next = onNext\n      ? function (this: OperatorSubscriber<T>, value: T) {\n          try {\n            onNext(value);\n          } catch (err) {\n            destination.error(err);\n          }\n        }\n      : super._next;\n    this._error = onError\n      ? function (this: OperatorSubscriber<T>, err: any) {\n          try {\n            onError(err);\n          } catch (err) {\n            // Send any errors that occur down stream.\n            destination.error(err);\n          } finally {\n            // Ensure finalization.\n            this.unsubscribe();\n          }\n        }\n      : super._error;\n    this._complete = onComplete\n      ? 
function (this: OperatorSubscriber<T>) {\n          try {\n            onComplete();\n          } catch (err) {\n            // Send any errors that occur down stream.\n            destination.error(err);\n          } finally {\n            // Ensure finalization.\n            this.unsubscribe();\n          }\n        }\n      : super._complete;\n  }\n\n  unsubscribe() {\n    if (!this.shouldUnsubscribe || this.shouldUnsubscribe()) {\n      const { closed } = this;\n      super.unsubscribe();\n      // Execute additional teardown if we have any and we didn't already do so.\n      !closed && this.onFinalize?.();\n    }\n  }\n}\n", "import { Subscription } from '../Subscription';\n\ninterface AnimationFrameProvider {\n  schedule(callback: FrameRequestCallback): Subscription;\n  requestAnimationFrame: typeof requestAnimationFrame;\n  cancelAnimationFrame: typeof cancelAnimationFrame;\n  delegate:\n    | {\n        requestAnimationFrame: typeof requestAnimationFrame;\n        cancelAnimationFrame: typeof cancelAnimationFrame;\n      }\n    | undefined;\n}\n\nexport const animationFrameProvider: AnimationFrameProvider = {\n  // When accessing the delegate, use the variable rather than `this` so that\n  // the functions can be called without being bound to the provider.\n  schedule(callback) {\n    let request = requestAnimationFrame;\n    let cancel: typeof cancelAnimationFrame | undefined = cancelAnimationFrame;\n    const { delegate } = animationFrameProvider;\n    if (delegate) {\n      request = delegate.requestAnimationFrame;\n      cancel = delegate.cancelAnimationFrame;\n    }\n    const handle = request((timestamp) => {\n      // Clear the cancel function. 
The request has been fulfilled, so\n      // attempting to cancel the request upon unsubscription would be\n      // pointless.\n      cancel = undefined;\n      callback(timestamp);\n    });\n    return new Subscription(() => cancel?.(handle));\n  },\n  requestAnimationFrame(...args) {\n    const { delegate } = animationFrameProvider;\n    return (delegate?.requestAnimationFrame || requestAnimationFrame)(...args);\n  },\n  cancelAnimationFrame(...args) {\n    const { delegate } = animationFrameProvider;\n    return (delegate?.cancelAnimationFrame || cancelAnimationFrame)(...args);\n  },\n  delegate: undefined,\n};\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface ObjectUnsubscribedError extends Error {}\n\nexport interface ObjectUnsubscribedErrorCtor {\n  /**\n   * @deprecated Internal implementation detail. Do not construct error instances.\n   * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n   */\n  new (): ObjectUnsubscribedError;\n}\n\n/**\n * An error thrown when an action is invalid because the object has been\n * unsubscribed.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n *\n * @class ObjectUnsubscribedError\n */\nexport const ObjectUnsubscribedError: ObjectUnsubscribedErrorCtor = createErrorClass(\n  (_super) =>\n    function ObjectUnsubscribedErrorImpl(this: any) {\n      _super(this);\n      this.name = 'ObjectUnsubscribedError';\n      this.message = 'object unsubscribed';\n    }\n);\n", "import { Operator } from './Operator';\nimport { Observable } from './Observable';\nimport { Subscriber } from './Subscriber';\nimport { Subscription, EMPTY_SUBSCRIPTION } from './Subscription';\nimport { Observer, SubscriptionLike, TeardownLogic } from './types';\nimport { ObjectUnsubscribedError } from './util/ObjectUnsubscribedError';\nimport { arrRemove } from './util/arrRemove';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A Subject is a special type of Observable 
that allows values to be\n * multicasted to many Observers. Subjects are like EventEmitters.\n *\n * Every Subject is an Observable and an Observer. You can subscribe to a\n * Subject, and you can call next to feed values as well as error and complete.\n */\nexport class Subject<T> extends Observable<T> implements SubscriptionLike {\n  closed = false;\n\n  private currentObservers: Observer<T>[] | null = null;\n\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n  observers: Observer<T>[] = [];\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n  isStopped = false;\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n  hasError = false;\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n  thrownError: any = null;\n\n  /**\n   * Creates a \"subject\" by basically gluing an observer to an observable.\n   *\n   * @nocollapse\n   * @deprecated Recommended you do not use. Will be removed at some point in the future. Plans for replacement still under discussion.\n   */\n  static create: (...args: any[]) => any = <T>(destination: Observer<T>, source: Observable<T>): AnonymousSubject<T> => {\n    return new AnonymousSubject<T>(destination, source);\n  };\n\n  constructor() {\n    // NOTE: This must be here to obscure Observable's constructor.\n    super();\n  }\n\n  /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. 
*/\n  lift<R>(operator: Operator<T, R>): Observable<R> {\n    const subject = new AnonymousSubject(this, this);\n    subject.operator = operator as any;\n    return subject as any;\n  }\n\n  /** @internal */\n  protected _throwIfClosed() {\n    if (this.closed) {\n      throw new ObjectUnsubscribedError();\n    }\n  }\n\n  next(value: T) {\n    errorContext(() => {\n      this._throwIfClosed();\n      if (!this.isStopped) {\n        if (!this.currentObservers) {\n          this.currentObservers = Array.from(this.observers);\n        }\n        for (const observer of this.currentObservers) {\n          observer.next(value);\n        }\n      }\n    });\n  }\n\n  error(err: any) {\n    errorContext(() => {\n      this._throwIfClosed();\n      if (!this.isStopped) {\n        this.hasError = this.isStopped = true;\n        this.thrownError = err;\n        const { observers } = this;\n        while (observers.length) {\n          observers.shift()!.error(err);\n        }\n      }\n    });\n  }\n\n  complete() {\n    errorContext(() => {\n      this._throwIfClosed();\n      if (!this.isStopped) {\n        this.isStopped = true;\n        const { observers } = this;\n        while (observers.length) {\n          observers.shift()!.complete();\n        }\n      }\n    });\n  }\n\n  unsubscribe() {\n    this.isStopped = this.closed = true;\n    this.observers = this.currentObservers = null!;\n  }\n\n  get observed() {\n    return this.observers?.length > 0;\n  }\n\n  /** @internal */\n  protected _trySubscribe(subscriber: Subscriber<T>): TeardownLogic {\n    this._throwIfClosed();\n    return super._trySubscribe(subscriber);\n  }\n\n  /** @internal */\n  protected _subscribe(subscriber: Subscriber<T>): Subscription {\n    this._throwIfClosed();\n    this._checkFinalizedStatuses(subscriber);\n    return this._innerSubscribe(subscriber);\n  }\n\n  /** @internal */\n  protected _innerSubscribe(subscriber: Subscriber<any>) {\n    const { hasError, isStopped, observers } = 
this;\n    if (hasError || isStopped) {\n      return EMPTY_SUBSCRIPTION;\n    }\n    this.currentObservers = null;\n    observers.push(subscriber);\n    return new Subscription(() => {\n      this.currentObservers = null;\n      arrRemove(observers, subscriber);\n    });\n  }\n\n  /** @internal */\n  protected _checkFinalizedStatuses(subscriber: Subscriber<any>) {\n    const { hasError, thrownError, isStopped } = this;\n    if (hasError) {\n      subscriber.error(thrownError);\n    } else if (isStopped) {\n      subscriber.complete();\n    }\n  }\n\n  /**\n   * Creates a new Observable with this Subject as the source. You can do this\n   * to create custom Observer-side logic of the Subject and conceal it from\n   * code that uses the Observable.\n   * @return {Observable} Observable that the Subject casts to\n   */\n  asObservable(): Observable<T> {\n    const observable: any = new Observable<T>();\n    observable.source = this;\n    return observable;\n  }\n}\n\n/**\n * @class AnonymousSubject<T>\n */\nexport class AnonymousSubject<T> extends Subject<T> {\n  constructor(\n    /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n    public destination?: Observer<T>,\n    source?: Observable<T>\n  ) {\n    super();\n    this.source = source;\n  }\n\n  next(value: T) {\n    this.destination?.next?.(value);\n  }\n\n  error(err: any) {\n    this.destination?.error?.(err);\n  }\n\n  complete() {\n    this.destination?.complete?.();\n  }\n\n  /** @internal */\n  protected _subscribe(subscriber: Subscriber<T>): Subscription {\n    return this.source?.subscribe(subscriber) ?? 
EMPTY_SUBSCRIPTION;\n  }\n}\n", "import { Subject } from './Subject';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\n\n/**\n * A variant of Subject that requires an initial value and emits its current\n * value whenever it is subscribed to.\n *\n * @class BehaviorSubject<T>\n */\nexport class BehaviorSubject<T> extends Subject<T> {\n  constructor(private _value: T) {\n    super();\n  }\n\n  get value(): T {\n    return this.getValue();\n  }\n\n  /** @internal */\n  protected _subscribe(subscriber: Subscriber<T>): Subscription {\n    const subscription = super._subscribe(subscriber);\n    !subscription.closed && subscriber.next(this._value);\n    return subscription;\n  }\n\n  getValue(): T {\n    const { hasError, thrownError, _value } = this;\n    if (hasError) {\n      throw thrownError;\n    }\n    this._throwIfClosed();\n    return _value;\n  }\n\n  next(value: T): void {\n    super.next((this._value = value));\n  }\n}\n", "import { TimestampProvider } from '../types';\n\ninterface DateTimestampProvider extends TimestampProvider {\n  delegate: TimestampProvider | undefined;\n}\n\nexport const dateTimestampProvider: DateTimestampProvider = {\n  now() {\n    // Use the variable rather than `this` so that the function can be called\n    // without being bound to the provider.\n    return (dateTimestampProvider.delegate || Date).now();\n  },\n  delegate: undefined,\n};\n", "import { Subject } from './Subject';\nimport { TimestampProvider } from './types';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * A variant of {@link Subject} that \"replays\" old values to new subscribers by emitting them when they first subscribe.\n *\n * `ReplaySubject` has an internal buffer that will store a specified number of values that it has observed. 
Like `Subject`,\n * `ReplaySubject` \"observes\" values by having them passed to its `next` method. When it observes a value, it will store that\n * value for a time determined by the configuration of the `ReplaySubject`, as passed to its constructor.\n *\n * When a new subscriber subscribes to the `ReplaySubject` instance, it will synchronously emit all values in its buffer in\n * a First-In-First-Out (FIFO) manner. The `ReplaySubject` will also complete, if it has observed completion; and it will\n * error if it has observed an error.\n *\n * There are two main configuration items to be concerned with:\n *\n * 1. `bufferSize` - This will determine how many items are stored in the buffer, defaults to infinite.\n * 2. `windowTime` - The amount of time to hold a value in the buffer before removing it from the buffer.\n *\n * Both configurations may exist simultaneously. So if you would like to buffer a maximum of 3 values, as long as the values\n * are less than 2 seconds old, you could do so with a `new ReplaySubject(3, 2000)`.\n *\n * ### Differences with BehaviorSubject\n *\n * `BehaviorSubject` is similar to `new ReplaySubject(1)`, with a couple of exceptions:\n *\n * 1. `BehaviorSubject` comes \"primed\" with a single value upon construction.\n * 2. `ReplaySubject` will replay values, even after observing an error, where `BehaviorSubject` will not.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n * @see {@link shareReplay}\n */\nexport class ReplaySubject<T> extends Subject<T> {\n  private _buffer: (T | number)[] = [];\n  private _infiniteTimeWindow = true;\n\n  /**\n   * @param bufferSize The size of the buffer to replay on subscription\n   * @param windowTime The amount of time the buffered items will stay buffered\n   * @param timestampProvider An object with a `now()` method that provides the current timestamp. 
This is used to\n   * calculate the amount of time something has been buffered.\n   */\n  constructor(\n    private _bufferSize = Infinity,\n    private _windowTime = Infinity,\n    private _timestampProvider: TimestampProvider = dateTimestampProvider\n  ) {\n    super();\n    this._infiniteTimeWindow = _windowTime === Infinity;\n    this._bufferSize = Math.max(1, _bufferSize);\n    this._windowTime = Math.max(1, _windowTime);\n  }\n\n  next(value: T): void {\n    const { isStopped, _buffer, _infiniteTimeWindow, _timestampProvider, _windowTime } = this;\n    if (!isStopped) {\n      _buffer.push(value);\n      !_infiniteTimeWindow && _buffer.push(_timestampProvider.now() + _windowTime);\n    }\n    this._trimBuffer();\n    super.next(value);\n  }\n\n  /** @internal */\n  protected _subscribe(subscriber: Subscriber<T>): Subscription {\n    this._throwIfClosed();\n    this._trimBuffer();\n\n    const subscription = this._innerSubscribe(subscriber);\n\n    const { _infiniteTimeWindow, _buffer } = this;\n    // We use a copy here, so reentrant code does not mutate our array while we're\n    // emitting it to a new subscriber.\n    const copy = _buffer.slice();\n    for (let i = 0; i < copy.length && !subscriber.closed; i += _infiniteTimeWindow ? 1 : 2) {\n      subscriber.next(copy[i] as T);\n    }\n\n    this._checkFinalizedStatuses(subscriber);\n\n    return subscription;\n  }\n\n  private _trimBuffer() {\n    const { _bufferSize, _timestampProvider, _buffer, _infiniteTimeWindow } = this;\n    // If we don't have an infinite buffer size, and we're over the length,\n    // use splice to truncate the old buffer values off. Note that we have to\n    // double the size for instances where we're not using an infinite time window\n    // because we're storing the values and the timestamps in the same array.\n    const adjustedBufferSize = (_infiniteTimeWindow ? 
1 : 2) * _bufferSize;\n    _bufferSize < Infinity && adjustedBufferSize < _buffer.length && _buffer.splice(0, _buffer.length - adjustedBufferSize);\n\n    // Now, if we're not in an infinite time window, remove all values where the time is\n    // older than what is allowed.\n    if (!_infiniteTimeWindow) {\n      const now = _timestampProvider.now();\n      let last = 0;\n      // Search the array for the first timestamp that isn't expired and\n      // truncate the buffer up to that point.\n      for (let i = 1; i < _buffer.length && (_buffer[i] as number) <= now; i += 2) {\n        last = i;\n      }\n      last && _buffer.splice(0, last + 1);\n    }\n  }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Subscription } from '../Subscription';\nimport { SchedulerAction } from '../types';\n\n/**\n * A unit of work to be executed in a `scheduler`. An action is typically\n * created from within a {@link SchedulerLike} and an RxJS user does not need to concern\n * themselves about creating and manipulating an Action.\n *\n * ```ts\n * class Action<T> extends Subscription {\n *   new (scheduler: Scheduler, work: (state?: T) => void);\n *   schedule(state?: T, delay: number = 0): Subscription;\n * }\n * ```\n *\n * @class Action<T>\n */\nexport class Action<T> extends Subscription {\n  constructor(scheduler: Scheduler, work: (this: SchedulerAction<T>, state?: T) => void) {\n    super();\n  }\n  /**\n   * Schedules this action on its parent {@link SchedulerLike} for execution. May be passed\n   * some context object, `state`. 
May happen at some point in the future,\n   * according to the `delay` parameter, if specified.\n   * @param {T} [state] Some contextual data that the `work` function uses when\n   * called by the Scheduler.\n   * @param {number} [delay] Time to wait before executing the work, where the\n   * time unit is implicit and defined by the Scheduler.\n   * @return {void}\n   */\n  public schedule(state?: T, delay: number = 0): Subscription {\n    return this;\n  }\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetIntervalFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearIntervalFunction = (handle: TimerHandle) => void;\n\ninterface IntervalProvider {\n  setInterval: SetIntervalFunction;\n  clearInterval: ClearIntervalFunction;\n  delegate:\n    | {\n        setInterval: SetIntervalFunction;\n        clearInterval: ClearIntervalFunction;\n      }\n    | undefined;\n}\n\nexport const intervalProvider: IntervalProvider = {\n  // When accessing the delegate, use the variable rather than `this` so that\n  // the functions can be called without being bound to the provider.\n  setInterval(handler: () => void, timeout?: number, ...args) {\n    const { delegate } = intervalProvider;\n    if (delegate?.setInterval) {\n      return delegate.setInterval(handler, timeout, ...args);\n    }\n    return setInterval(handler, timeout, ...args);\n  },\n  clearInterval(handle) {\n    const { delegate } = intervalProvider;\n    return (delegate?.clearInterval || clearInterval)(handle as any);\n  },\n  delegate: undefined,\n};\n", "import { Action } from './Action';\nimport { SchedulerAction } from '../types';\nimport { Subscription } from '../Subscription';\nimport { AsyncScheduler } from './AsyncScheduler';\nimport { intervalProvider } from './intervalProvider';\nimport { arrRemove } from '../util/arrRemove';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncAction<T> extends Action<T> {\n  public id: TimerHandle | 
undefined;\n  public state?: T;\n  // @ts-ignore: Property has no initializer and is not definitely assigned\n  public delay: number;\n  protected pending: boolean = false;\n\n  constructor(protected scheduler: AsyncScheduler, protected work: (this: SchedulerAction<T>, state?: T) => void) {\n    super(scheduler, work);\n  }\n\n  public schedule(state?: T, delay: number = 0): Subscription {\n    if (this.closed) {\n      return this;\n    }\n\n    // Always replace the current state with the new state.\n    this.state = state;\n\n    const id = this.id;\n    const scheduler = this.scheduler;\n\n    //\n    // Important implementation note:\n    //\n    // Actions only execute once by default, unless rescheduled from within the\n    // scheduled callback. This allows us to implement single and repeat\n    // actions via the same code path, without adding API surface area, as well\n    // as mimic traditional recursion but across asynchronous boundaries.\n    //\n    // However, JS runtimes and timers distinguish between intervals achieved by\n    // serial `setTimeout` calls vs. a single `setInterval` call. An interval of\n    // serial `setTimeout` calls can be individually delayed, which delays\n    // scheduling the next `setTimeout`, and so on. `setInterval` attempts to\n    // guarantee the interval callback will be invoked more precisely to the\n    // interval period, regardless of load.\n    //\n    // Therefore, we use `setInterval` to schedule single and repeat actions.\n    // If the action reschedules itself with the same delay, the interval is not\n    // canceled. 
If the action doesn't reschedule, or reschedules with a\n    // different delay, the interval will be canceled after scheduled callback\n    // execution.\n    //\n    if (id != null) {\n      this.id = this.recycleAsyncId(scheduler, id, delay);\n    }\n\n    // Set the pending flag indicating that this action has been scheduled, or\n    // has recursively rescheduled itself.\n    this.pending = true;\n\n    this.delay = delay;\n    // If this action has already an async Id, don't request a new one.\n    this.id = this.id ?? this.requestAsyncId(scheduler, this.id, delay);\n\n    return this;\n  }\n\n  protected requestAsyncId(scheduler: AsyncScheduler, _id?: TimerHandle, delay: number = 0): TimerHandle {\n    return intervalProvider.setInterval(scheduler.flush.bind(scheduler, this), delay);\n  }\n\n  protected recycleAsyncId(_scheduler: AsyncScheduler, id?: TimerHandle, delay: number | null = 0): TimerHandle | undefined {\n    // If this action is rescheduled with the same delay time, don't clear the interval id.\n    if (delay != null && this.delay === delay && this.pending === false) {\n      return id;\n    }\n    // Otherwise, if the action's delay time is different from the current delay,\n    // or the action has been rescheduled before it's executed, clear the interval id\n    if (id != null) {\n      intervalProvider.clearInterval(id);\n    }\n\n    return undefined;\n  }\n\n  /**\n   * Immediately executes this action and the `work` it contains.\n   * @return {any}\n   */\n  public execute(state: T, delay: number): any {\n    if (this.closed) {\n      return new Error('executing a cancelled action');\n    }\n\n    this.pending = false;\n    const error = this._execute(state, delay);\n    if (error) {\n      return error;\n    } else if (this.pending === false && this.id != null) {\n      // Dequeue if the action didn't reschedule itself. 
Don't call\n      // unsubscribe(), because the action could reschedule later.\n      // For example:\n      // ```\n      // scheduler.schedule(function doWork(counter) {\n      //   /* ... I'm a busy worker bee ... */\n      //   var originalAction = this;\n      //   /* wait 100ms before rescheduling the action */\n      //   setTimeout(function () {\n      //     originalAction.schedule(counter + 1);\n      //   }, 100);\n      // }, 1000);\n      // ```\n      this.id = this.recycleAsyncId(this.scheduler, this.id, null);\n    }\n  }\n\n  protected _execute(state: T, _delay: number): any {\n    let errored: boolean = false;\n    let errorValue: any;\n    try {\n      this.work(state);\n    } catch (e) {\n      errored = true;\n      // HACK: Since code elsewhere is relying on the \"truthiness\" of the\n      // return here, we can't have it return \"\" or 0 or false.\n      // TODO: Clean this up when we refactor schedulers mid-version-8 or so.\n      errorValue = e ? e : new Error('Scheduled action threw falsy error');\n    }\n    if (errored) {\n      this.unsubscribe();\n      return errorValue;\n    }\n  }\n\n  unsubscribe() {\n    if (!this.closed) {\n      const { id, scheduler } = this;\n      const { actions } = scheduler;\n\n      this.work = this.state = this.scheduler = null!;\n      this.pending = false;\n\n      arrRemove(actions, this);\n      if (id != null) {\n        this.id = this.recycleAsyncId(scheduler, id, null);\n      }\n\n      this.delay = null!;\n      super.unsubscribe();\n    }\n  }\n}\n", "import { Action } from './scheduler/Action';\nimport { Subscription } from './Subscription';\nimport { SchedulerLike, SchedulerAction } from './types';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * An execution context and a data structure to order tasks and schedule their\n * execution. 
Provides a notion of (potentially virtual) time, through the\n * `now()` getter method.\n *\n * Each unit of work in a Scheduler is called an `Action`.\n *\n * ```ts\n * class Scheduler {\n *   now(): number;\n *   schedule(work, delay?, state?): Subscription;\n * }\n * ```\n *\n * @class Scheduler\n * @deprecated Scheduler is an internal implementation detail of RxJS, and\n * should not be used directly. Rather, create your own class and implement\n * {@link SchedulerLike}. Will be made internal in v8.\n */\nexport class Scheduler implements SchedulerLike {\n  public static now: () => number = dateTimestampProvider.now;\n\n  constructor(private schedulerActionCtor: typeof Action, now: () => number = Scheduler.now) {\n    this.now = now;\n  }\n\n  /**\n   * A getter method that returns a number representing the current time\n   * (at the time this function was called) according to the scheduler's own\n   * internal clock.\n   * @return {number} A number that represents the current time. May or may not\n   * have a relation to wall-clock time. May or may not refer to a time unit\n   * (e.g. milliseconds).\n   */\n  public now: () => number;\n\n  /**\n   * Schedules a function, `work`, for execution. May happen at some point in\n   * the future, according to the `delay` parameter, if specified. 
May be passed\n   * some context object, `state`, which will be passed to the `work` function.\n   *\n   * The given arguments will be processed an stored as an Action object in a\n   * queue of actions.\n   *\n   * @param {function(state: ?T): ?Subscription} work A function representing a\n   * task, or some unit of work to be executed by the Scheduler.\n   * @param {number} [delay] Time to wait before executing the work, where the\n   * time unit is implicit and defined by the Scheduler itself.\n   * @param {T} [state] Some contextual data that the `work` function uses when\n   * called by the Scheduler.\n   * @return {Subscription} A subscription in order to be able to unsubscribe\n   * the scheduled work.\n   */\n  public schedule<T>(work: (this: SchedulerAction<T>, state?: T) => void, delay: number = 0, state?: T): Subscription {\n    return new this.schedulerActionCtor<T>(this, work).schedule(state, delay);\n  }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Action } from './Action';\nimport { AsyncAction } from './AsyncAction';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncScheduler extends Scheduler {\n  public actions: Array<AsyncAction<any>> = [];\n  /**\n   * A flag to indicate whether the Scheduler is currently executing a batch of\n   * queued actions.\n   * @type {boolean}\n   * @internal\n   */\n  public _active: boolean = false;\n  /**\n   * An internal ID used to track the latest asynchronous task such as those\n   * coming from `setTimeout`, `setInterval`, `requestAnimationFrame`, and\n   * others.\n   * @type {any}\n   * @internal\n   */\n  public _scheduled: TimerHandle | undefined;\n\n  constructor(SchedulerAction: typeof Action, now: () => number = Scheduler.now) {\n    super(SchedulerAction, now);\n  }\n\n  public flush(action: AsyncAction<any>): void {\n    const { actions } = this;\n\n    if (this._active) {\n      actions.push(action);\n      return;\n    }\n\n    let error: any;\n    this._active = 
true;\n\n    do {\n      if ((error = action.execute(action.state, action.delay))) {\n        break;\n      }\n    } while ((action = actions.shift()!)); // exhaust the scheduler queue\n\n    this._active = false;\n\n    if (error) {\n      while ((action = actions.shift()!)) {\n        action.unsubscribe();\n      }\n      throw error;\n    }\n  }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\n/**\n *\n * Async Scheduler\n *\n * <span class=\"informal\">Schedule task as if you used setTimeout(task, duration)</span>\n *\n * `async` scheduler schedules tasks asynchronously, by putting them on the JavaScript\n * event loop queue. It is best used to delay tasks in time or to schedule tasks repeating\n * in intervals.\n *\n * If you just want to \"defer\" task, that is to perform it right after currently\n * executing synchronous code ends (commonly achieved by `setTimeout(deferredTask, 0)`),\n * better choice will be the {@link asapScheduler} scheduler.\n *\n * ## Examples\n * Use async scheduler to delay task\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * const task = () => console.log('it works!');\n *\n * asyncScheduler.schedule(task, 2000);\n *\n * // After 2 seconds logs:\n * // \"it works!\"\n * ```\n *\n * Use async scheduler to repeat task in intervals\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * function task(state) {\n *   console.log(state);\n *   this.schedule(state + 1, 1000); // `this` references currently executing Action,\n *                                   // which we reschedule with new state and delay\n * }\n *\n * asyncScheduler.schedule(task, 3000, 0);\n *\n * // Logs:\n * // 0 after 3s\n * // 1 after 4s\n * // 2 after 5s\n * // 3 after 6s\n * ```\n */\n\nexport const asyncScheduler = new AsyncScheduler(AsyncAction);\n\n/**\n * @deprecated Renamed to {@link asyncScheduler}. 
Will be removed in v8.\n */\nexport const async = asyncScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { Subscription } from '../Subscription';\nimport { QueueScheduler } from './QueueScheduler';\nimport { SchedulerAction } from '../types';\nimport { TimerHandle } from './timerHandle';\n\nexport class QueueAction<T> extends AsyncAction<T> {\n  constructor(protected scheduler: QueueScheduler, protected work: (this: SchedulerAction<T>, state?: T) => void) {\n    super(scheduler, work);\n  }\n\n  public schedule(state?: T, delay: number = 0): Subscription {\n    if (delay > 0) {\n      return super.schedule(state, delay);\n    }\n    this.delay = delay;\n    this.state = state;\n    this.scheduler.flush(this);\n    return this;\n  }\n\n  public execute(state: T, delay: number): any {\n    return delay > 0 || this.closed ? super.execute(state, delay) : this._execute(state, delay);\n  }\n\n  protected requestAsyncId(scheduler: QueueScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n    // If delay exists and is greater than 0, or if the delay is null (the\n    // action wasn't rescheduled) but was originally scheduled as an async\n    // action, then recycle as an async action.\n\n    if ((delay != null && delay > 0) || (delay == null && this.delay > 0)) {\n      return super.requestAsyncId(scheduler, id, delay);\n    }\n\n    // Otherwise flush the scheduler starting with this action.\n    scheduler.flush(this);\n\n    // HACK: In the past, this was returning `void`. However, `void` isn't a valid\n    // `TimerHandle`, and generally the return value here isn't really used. 
So the\n    // compromise is to return `0` which is both \"falsy\" and a valid `TimerHandle`,\n    // as opposed to refactoring every other instanceo of `requestAsyncId`.\n    return 0;\n  }\n}\n", "import { AsyncScheduler } from './AsyncScheduler';\n\nexport class QueueScheduler extends AsyncScheduler {\n}\n", "import { QueueAction } from './QueueAction';\nimport { QueueScheduler } from './QueueScheduler';\n\n/**\n *\n * Queue Scheduler\n *\n * <span class=\"informal\">Put every next task on a queue, instead of executing it immediately</span>\n *\n * `queue` scheduler, when used with delay, behaves the same as {@link asyncScheduler} scheduler.\n *\n * When used without delay, it schedules given task synchronously - executes it right when\n * it is scheduled. However when called recursively, that is when inside the scheduled task,\n * another task is scheduled with queue scheduler, instead of executing immediately as well,\n * that task will be put on a queue and wait for current one to finish.\n *\n * This means that when you execute task with `queue` scheduler, you are sure it will end\n * before any other task scheduled with that scheduler will start.\n *\n * ## Examples\n * Schedule recursively first, then do something\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(() => {\n *   queueScheduler.schedule(() => console.log('second')); // will not happen now, but will be put on a queue\n *\n *   console.log('first');\n * });\n *\n * // Logs:\n * // \"first\"\n * // \"second\"\n * ```\n *\n * Reschedule itself recursively\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(function(state) {\n *   if (state !== 0) {\n *     console.log('before', state);\n *     this.schedule(state - 1); // `this` references currently executing Action,\n *                               // which we reschedule with new state\n *     console.log('after', state);\n *   }\n * }, 0, 3);\n *\n * // In scheduler that runs 
recursively, you would expect:\n * // \"before\", 3\n * // \"before\", 2\n * // \"before\", 1\n * // \"after\", 1\n * // \"after\", 2\n * // \"after\", 3\n *\n * // But with queue it logs:\n * // \"before\", 3\n * // \"after\", 3\n * // \"before\", 2\n * // \"after\", 2\n * // \"before\", 1\n * // \"after\", 1\n * ```\n */\n\nexport const queueScheduler = new QueueScheduler(QueueAction);\n\n/**\n * @deprecated Renamed to {@link queueScheduler}. Will be removed in v8.\n */\nexport const queue = queueScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\nimport { SchedulerAction } from '../types';\nimport { animationFrameProvider } from './animationFrameProvider';\nimport { TimerHandle } from './timerHandle';\n\nexport class AnimationFrameAction<T> extends AsyncAction<T> {\n  constructor(protected scheduler: AnimationFrameScheduler, protected work: (this: SchedulerAction<T>, state?: T) => void) {\n    super(scheduler, work);\n  }\n\n  protected requestAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n    // If delay is greater than 0, request as an async action.\n    if (delay !== null && delay > 0) {\n      return super.requestAsyncId(scheduler, id, delay);\n    }\n    // Push the action to the end of the scheduler queue.\n    scheduler.actions.push(this);\n    // If an animation frame has already been requested, don't request another\n    // one. If an animation frame hasn't been requested yet, request one. 
Return\n    // the current animation frame request id.\n    return scheduler._scheduled || (scheduler._scheduled = animationFrameProvider.requestAnimationFrame(() => scheduler.flush(undefined)));\n  }\n\n  protected recycleAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle | undefined {\n    // If delay exists and is greater than 0, or if the delay is null (the\n    // action wasn't rescheduled) but was originally scheduled as an async\n    // action, then recycle as an async action.\n    if (delay != null ? delay > 0 : this.delay > 0) {\n      return super.recycleAsyncId(scheduler, id, delay);\n    }\n    // If the scheduler queue has no remaining actions with the same async id,\n    // cancel the requested animation frame and set the scheduled flag to\n    // undefined so the next AnimationFrameAction will request its own.\n    const { actions } = scheduler;\n    if (id != null && actions[actions.length - 1]?.id !== id) {\n      animationFrameProvider.cancelAnimationFrame(id as number);\n      scheduler._scheduled = undefined;\n    }\n    // Return undefined so the action knows to request a new async id if it's rescheduled.\n    return undefined;\n  }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\nexport class AnimationFrameScheduler extends AsyncScheduler {\n  public flush(action?: AsyncAction<any>): void {\n    this._active = true;\n    // The async id that effects a call to flush is stored in _scheduled.\n    // Before executing an action, it's necessary to check the action's async\n    // id to determine whether it's supposed to be executed in the current\n    // flush.\n    // Previous implementations of this method used a count to determine this,\n    // but that was unsound, as actions that are unsubscribed - i.e. 
cancelled -\n    // are removed from the actions array and that can shift actions that are\n    // scheduled to be executed in a subsequent flush into positions at which\n    // they are executed within the current flush.\n    const flushId = this._scheduled;\n    this._scheduled = undefined;\n\n    const { actions } = this;\n    let error: any;\n    action = action || actions.shift()!;\n\n    do {\n      if ((error = action.execute(action.state, action.delay))) {\n        break;\n      }\n    } while ((action = actions[0]) && action.id === flushId && actions.shift());\n\n    this._active = false;\n\n    if (error) {\n      while ((action = actions[0]) && action.id === flushId && actions.shift()) {\n        action.unsubscribe();\n      }\n      throw error;\n    }\n  }\n}\n", "import { AnimationFrameAction } from './AnimationFrameAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\n\n/**\n *\n * Animation Frame Scheduler\n *\n * <span class=\"informal\">Perform task when `window.requestAnimationFrame` would fire</span>\n *\n * When `animationFrame` scheduler is used with delay, it will fall back to {@link asyncScheduler} scheduler\n * behaviour.\n *\n * Without delay, `animationFrame` scheduler can be used to create smooth browser animations.\n * It makes sure scheduled task will happen just before next browser content repaint,\n * thus performing animations as efficiently as possible.\n *\n * ## Example\n * Schedule div height animation\n * ```ts\n * // html: <div style=\"background: #0ff;\"></div>\n * import { animationFrameScheduler } from 'rxjs';\n *\n * const div = document.querySelector('div');\n *\n * animationFrameScheduler.schedule(function(height) {\n *   div.style.height = height + \"px\";\n *\n *   this.schedule(height + 1);  // `this` references currently executing Action,\n *                               // which we reschedule with new state\n * }, 0, 0);\n *\n * // You will see a div element growing in height\n * ```\n 
*/\n\nexport const animationFrameScheduler = new AnimationFrameScheduler(AnimationFrameAction);\n\n/**\n * @deprecated Renamed to {@link animationFrameScheduler}. Will be removed in v8.\n */\nexport const animationFrame = animationFrameScheduler;\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\n\n/**\n * A simple Observable that emits no items to the Observer and immediately\n * emits a complete notification.\n *\n * <span class=\"informal\">Just emits 'complete', and nothing else.</span>\n *\n * ![](empty.png)\n *\n * A simple Observable that only emits the complete notification. It can be used\n * for composing with other Observables, such as in a {@link mergeMap}.\n *\n * ## Examples\n *\n * Log complete notification\n *\n * ```ts\n * import { EMPTY } from 'rxjs';\n *\n * EMPTY.subscribe({\n *   next: () => console.log('Next'),\n *   complete: () => console.log('Complete!')\n * });\n *\n * // Outputs\n * // Complete!\n * ```\n *\n * Emit the number 7, then complete\n *\n * ```ts\n * import { EMPTY, startWith } from 'rxjs';\n *\n * const result = EMPTY.pipe(startWith(7));\n * result.subscribe(x => console.log(x));\n *\n * // Outputs\n * // 7\n * ```\n *\n * Map and flatten only odd numbers to the sequence `'a'`, `'b'`, `'c'`\n *\n * ```ts\n * import { interval, mergeMap, of, EMPTY } from 'rxjs';\n *\n * const interval$ = interval(1000);\n * const result = interval$.pipe(\n *   mergeMap(x => x % 2 === 1 ? of('a', 'b', 'c') : EMPTY),\n * );\n * result.subscribe(x => console.log(x));\n *\n * // Results in the following to the console:\n * // x is equal to the count on the interval, e.g. 
(0, 1, 2, 3, ...)\n * // x will occur every 1000ms\n * // if x % 2 is equal to 1, print a, b, c (each on its own)\n * // if x % 2 is not equal to 1, nothing will be output\n * ```\n *\n * @see {@link Observable}\n * @see {@link NEVER}\n * @see {@link of}\n * @see {@link throwError}\n */\nexport const EMPTY = new Observable<never>((subscriber) => subscriber.complete());\n\n/**\n * @param scheduler A {@link SchedulerLike} to use for scheduling\n * the emission of the complete notification.\n * @deprecated Replaced with the {@link EMPTY} constant or {@link scheduled} (e.g. `scheduled([], scheduler)`). Will be removed in v8.\n */\nexport function empty(scheduler?: SchedulerLike) {\n  return scheduler ? emptyScheduled(scheduler) : EMPTY;\n}\n\nfunction emptyScheduled(scheduler: SchedulerLike) {\n  return new Observable<never>((subscriber) => scheduler.schedule(() => subscriber.complete()));\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport function isScheduler(value: any): value is SchedulerLike {\n  return value && isFunction(value.schedule);\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\nimport { isScheduler } from './isScheduler';\n\nfunction last<T>(arr: T[]): T | undefined {\n  return arr[arr.length - 1];\n}\n\nexport function popResultSelector(args: any[]): ((...args: unknown[]) => unknown) | undefined {\n  return isFunction(last(args)) ? args.pop() : undefined;\n}\n\nexport function popScheduler(args: any[]): SchedulerLike | undefined {\n  return isScheduler(last(args)) ? args.pop() : undefined;\n}\n\nexport function popNumber(args: any[], defaultValue: number): number {\n  return typeof last(args) === 'number' ? args.pop()! 
: defaultValue;\n}\n", "export const isArrayLike = (<T>(x: any): x is ArrayLike<T> => x && typeof x.length === 'number' && typeof x !== 'function');", "import { isFunction } from \"./isFunction\";\n\n/**\n * Tests to see if the object is \"thennable\".\n * @param value the object to test\n */\nexport function isPromise(value: any): value is PromiseLike<any> {\n  return isFunction(value?.then);\n}\n", "import { InteropObservable } from '../types';\nimport { observable as Symbol_observable } from '../symbol/observable';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being Observable (but not necessary an Rx Observable) */\nexport function isInteropObservable(input: any): input is InteropObservable<any> {\n  return isFunction(input[Symbol_observable]);\n}\n", "import { isFunction } from './isFunction';\n\nexport function isAsyncIterable<T>(obj: any): obj is AsyncIterable<T> {\n  return Symbol.asyncIterator && isFunction(obj?.[Symbol.asyncIterator]);\n}\n", "/**\n * Creates the TypeError to throw if an invalid object is passed to `from` or `scheduled`.\n * @param input The object that was passed.\n */\nexport function createInvalidObservableTypeError(input: any) {\n  // TODO: We should create error codes that can be looked up, so this can be less verbose.\n  return new TypeError(\n    `You provided ${\n      input !== null && typeof input === 'object' ? 'an invalid object' : `'${input}'`\n    } where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.`\n  );\n}\n", "export function getSymbolIterator(): symbol {\n  if (typeof Symbol !== 'function' || !Symbol.iterator) {\n    return '@@iterator' as any;\n  }\n\n  return Symbol.iterator;\n}\n\nexport const iterator = getSymbolIterator();\n", "import { iterator as Symbol_iterator } from '../symbol/iterator';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being an Iterable */\nexport function isIterable(input: any): input is Iterable<any> {\n  return isFunction(input?.[Symbol_iterator]);\n}\n", "import { ReadableStreamLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport async function* readableStreamLikeToAsyncGenerator<T>(readableStream: ReadableStreamLike<T>): AsyncGenerator<T> {\n  const reader = readableStream.getReader();\n  try {\n    while (true) {\n      const { value, done } = await reader.read();\n      if (done) {\n        return;\n      }\n      yield value!;\n    }\n  } finally {\n    reader.releaseLock();\n  }\n}\n\nexport function isReadableStreamLike<T>(obj: any): obj is ReadableStreamLike<T> {\n  // We don't want to use instanceof checks because they would return\n  // false for instances from another Realm, like an <iframe>.\n  return isFunction(obj?.getReader);\n}\n", "import { isArrayLike } from '../util/isArrayLike';\nimport { isPromise } from '../util/isPromise';\nimport { Observable } from '../Observable';\nimport { ObservableInput, ObservedValueOf, ReadableStreamLike } from '../types';\nimport { isInteropObservable } from '../util/isInteropObservable';\nimport { isAsyncIterable } from '../util/isAsyncIterable';\nimport { createInvalidObservableTypeError } from '../util/throwUnobservableError';\nimport { isIterable } from '../util/isIterable';\nimport { isReadableStreamLike, readableStreamLikeToAsyncGenerator } from '../util/isReadableStreamLike';\nimport { Subscriber } from '../Subscriber';\nimport { 
isFunction } from '../util/isFunction';\nimport { reportUnhandledError } from '../util/reportUnhandledError';\nimport { observable as Symbol_observable } from '../symbol/observable';\n\nexport function innerFrom<O extends ObservableInput<any>>(input: O): Observable<ObservedValueOf<O>>;\nexport function innerFrom<T>(input: ObservableInput<T>): Observable<T> {\n  if (input instanceof Observable) {\n    return input;\n  }\n  if (input != null) {\n    if (isInteropObservable(input)) {\n      return fromInteropObservable(input);\n    }\n    if (isArrayLike(input)) {\n      return fromArrayLike(input);\n    }\n    if (isPromise(input)) {\n      return fromPromise(input);\n    }\n    if (isAsyncIterable(input)) {\n      return fromAsyncIterable(input);\n    }\n    if (isIterable(input)) {\n      return fromIterable(input);\n    }\n    if (isReadableStreamLike(input)) {\n      return fromReadableStreamLike(input);\n    }\n  }\n\n  throw createInvalidObservableTypeError(input);\n}\n\n/**\n * Creates an RxJS Observable from an object that implements `Symbol.observable`.\n * @param obj An object that properly implements `Symbol.observable`.\n */\nexport function fromInteropObservable<T>(obj: any) {\n  return new Observable((subscriber: Subscriber<T>) => {\n    const obs = obj[Symbol_observable]();\n    if (isFunction(obs.subscribe)) {\n      return obs.subscribe(subscriber);\n    }\n    // Should be caught by observable subscribe function error handling.\n    throw new TypeError('Provided object does not correctly implement Symbol.observable');\n  });\n}\n\n/**\n * Synchronously emits the values of an array like and completes.\n * This is exported because there are creation functions and operators that need to\n * make direct use of the same logic, and there's no reason to make them run through\n * `from` conditionals because we *know* they're dealing with an array.\n * @param array The array to emit values from\n */\nexport function fromArrayLike<T>(array: ArrayLike<T>) {\n  
return new Observable((subscriber: Subscriber<T>) => {\n    // Loop over the array and emit each value. Note two things here:\n    // 1. We're making sure that the subscriber is not closed on each loop.\n    //    This is so we don't continue looping over a very large array after\n    //    something like a `take`, `takeWhile`, or other synchronous unsubscription\n    //    has already unsubscribed.\n    // 2. In this form, reentrant code can alter that array we're looping over.\n    //    This is a known issue, but considered an edge case. The alternative would\n    //    be to copy the array before executing the loop, but this has\n    //    performance implications.\n    for (let i = 0; i < array.length && !subscriber.closed; i++) {\n      subscriber.next(array[i]);\n    }\n    subscriber.complete();\n  });\n}\n\nexport function fromPromise<T>(promise: PromiseLike<T>) {\n  return new Observable((subscriber: Subscriber<T>) => {\n    promise\n      .then(\n        (value) => {\n          if (!subscriber.closed) {\n            subscriber.next(value);\n            subscriber.complete();\n          }\n        },\n        (err: any) => subscriber.error(err)\n      )\n      .then(null, reportUnhandledError);\n  });\n}\n\nexport function fromIterable<T>(iterable: Iterable<T>) {\n  return new Observable((subscriber: Subscriber<T>) => {\n    for (const value of iterable) {\n      subscriber.next(value);\n      if (subscriber.closed) {\n        return;\n      }\n    }\n    subscriber.complete();\n  });\n}\n\nexport function fromAsyncIterable<T>(asyncIterable: AsyncIterable<T>) {\n  return new Observable((subscriber: Subscriber<T>) => {\n    process(asyncIterable, subscriber).catch((err) => subscriber.error(err));\n  });\n}\n\nexport function fromReadableStreamLike<T>(readableStream: ReadableStreamLike<T>) {\n  return fromAsyncIterable(readableStreamLikeToAsyncGenerator(readableStream));\n}\n\nasync function process<T>(asyncIterable: AsyncIterable<T>, subscriber: 
Subscriber<T>) {\n  for await (const value of asyncIterable) {\n    subscriber.next(value);\n    // A side-effect may have closed our subscriber,\n    // check before the next iteration.\n    if (subscriber.closed) {\n      return;\n    }\n  }\n  subscriber.complete();\n}\n", "import { Subscription } from '../Subscription';\nimport { SchedulerAction, SchedulerLike } from '../types';\n\nexport function executeSchedule(\n  parentSubscription: Subscription,\n  scheduler: SchedulerLike,\n  work: () => void,\n  delay: number,\n  repeat: true\n): void;\nexport function executeSchedule(\n  parentSubscription: Subscription,\n  scheduler: SchedulerLike,\n  work: () => void,\n  delay?: number,\n  repeat?: false\n): Subscription;\n\nexport function executeSchedule(\n  parentSubscription: Subscription,\n  scheduler: SchedulerLike,\n  work: () => void,\n  delay = 0,\n  repeat = false\n): Subscription | void {\n  const scheduleSubscription = scheduler.schedule(function (this: SchedulerAction<any>) {\n    work();\n    if (repeat) {\n      parentSubscription.add(this.schedule(null, delay));\n    } else {\n      this.unsubscribe();\n    }\n  }, delay);\n\n  parentSubscription.add(scheduleSubscription);\n\n  if (!repeat) {\n    // Because user-land scheduler implementations are unlikely to properly reuse\n    // Actions for repeat scheduling, we can't trust that the returned subscription\n    // will control repeat subscription scenarios. 
So we're trying to avoid using them\n    // incorrectly within this library.\n    return scheduleSubscription;\n  }\n}\n", "/** @prettier */\nimport { MonoTypeOperatorFunction, SchedulerLike } from '../types';\nimport { executeSchedule } from '../util/executeSchedule';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * Re-emits all notifications from source Observable with specified scheduler.\n *\n * <span class=\"informal\">Ensure a specific scheduler is used, from outside of an Observable.</span>\n *\n * `observeOn` is an operator that accepts a scheduler as a first parameter, which will be used to reschedule\n * notifications emitted by the source Observable. It might be useful, if you do not have control over\n * internal scheduler of a given Observable, but want to control when its values are emitted nevertheless.\n *\n * Returned Observable emits the same notifications (nexted values, complete and error events) as the source Observable,\n * but rescheduled with provided scheduler. Note that this doesn't mean that source Observables internal\n * scheduler will be replaced in any way. Original scheduler still will be used, but when the source Observable emits\n * notification, it will be immediately scheduled again - this time with scheduler passed to `observeOn`.\n * An anti-pattern would be calling `observeOn` on Observable that emits lots of values synchronously, to split\n * that emissions into asynchronous chunks. For this to happen, scheduler would have to be passed into the source\n * Observable directly (usually into the operator that creates it). `observeOn` simply delays notifications a\n * little bit more, to ensure that they are emitted at expected moments.\n *\n * As a matter of fact, `observeOn` accepts second parameter, which specifies in milliseconds with what delay notifications\n * will be emitted. 
The main difference between {@link delay} operator and `observeOn` is that `observeOn`\n * will delay all notifications - including error notifications - while `delay` will pass through error\n * from source Observable immediately when it is emitted. In general it is highly recommended to use `delay` operator\n * for any kind of delaying of values in the stream, while using `observeOn` to specify which scheduler should be used\n * for notification emissions in general.\n *\n * ## Example\n *\n * Ensure values in subscribe are called just before browser repaint\n *\n * ```ts\n * import { interval, observeOn, animationFrameScheduler } from 'rxjs';\n *\n * const someDiv = document.createElement('div');\n * someDiv.style.cssText = 'width: 200px;background: #09c';\n * document.body.appendChild(someDiv);\n * const intervals = interval(10);      // Intervals are scheduled\n *                                      // with async scheduler by default...\n * intervals.pipe(\n *   observeOn(animationFrameScheduler) // ...but we will observe on animationFrame\n * )                                    // scheduler to ensure smooth animation.\n * .subscribe(val => {\n *   someDiv.style.height = val + 'px';\n * });\n * ```\n *\n * @see {@link delay}\n *\n * @param scheduler Scheduler that will be used to reschedule notifications from source Observable.\n * @param delay Number of milliseconds that states with what delay every notification should be rescheduled.\n * @return A function that returns an Observable that emits the same\n * notifications as the source Observable, but with provided scheduler.\n */\nexport function observeOn<T>(scheduler: SchedulerLike, delay = 0): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => executeSchedule(subscriber, scheduler, () => subscriber.next(value), delay),\n        () => executeSchedule(subscriber, scheduler, () => 
subscriber.complete(), delay),\n        (err) => executeSchedule(subscriber, scheduler, () => subscriber.error(err), delay)\n      )\n    );\n  });\n}\n", "import { MonoTypeOperatorFunction, SchedulerLike } from '../types';\nimport { operate } from '../util/lift';\n\n/**\n * Asynchronously subscribes Observers to this Observable on the specified {@link SchedulerLike}.\n *\n * With `subscribeOn` you can decide what type of scheduler a specific Observable will be using when it is subscribed to.\n *\n * Schedulers control the speed and order of emissions to observers from an Observable stream.\n *\n * ![](subscribeOn.png)\n *\n * ## Example\n *\n * Given the following code:\n *\n * ```ts\n * import { of, merge } from 'rxjs';\n *\n * const a = of(1, 2, 3);\n * const b = of(4, 5, 6);\n *\n * merge(a, b).subscribe(console.log);\n *\n * // Outputs\n * // 1\n * // 2\n * // 3\n * // 4\n * // 5\n * // 6\n * ```\n *\n * Both Observable `a` and `b` will emit their values directly and synchronously once they are subscribed to.\n *\n * If we instead use the `subscribeOn` operator declaring that we want to use the {@link asyncScheduler} for values emitted by Observable `a`:\n *\n * ```ts\n * import { of, subscribeOn, asyncScheduler, merge } from 'rxjs';\n *\n * const a = of(1, 2, 3).pipe(subscribeOn(asyncScheduler));\n * const b = of(4, 5, 6);\n *\n * merge(a, b).subscribe(console.log);\n *\n * // Outputs\n * // 4\n * // 5\n * // 6\n * // 1\n * // 2\n * // 3\n * ```\n *\n * The reason for this is that Observable `b` emits its values directly and synchronously like before\n * but the emissions from `a` are scheduled on the event loop because we are now using the {@link asyncScheduler} for that specific Observable.\n *\n * @param scheduler The {@link SchedulerLike} to perform subscription actions on.\n * @param delay A delay to pass to the scheduler to delay subscriptions\n * @return A function that returns an Observable modified so that its\n * subscriptions happen on the 
specified {@link SchedulerLike}.\n */\nexport function subscribeOn<T>(scheduler: SchedulerLike, delay: number = 0): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    subscriber.add(scheduler.schedule(() => source.subscribe(subscriber), delay));\n  });\n}\n", "import { innerFrom } from '../observable/innerFrom';\nimport { observeOn } from '../operators/observeOn';\nimport { subscribeOn } from '../operators/subscribeOn';\nimport { InteropObservable, SchedulerLike } from '../types';\n\nexport function scheduleObservable<T>(input: InteropObservable<T>, scheduler: SchedulerLike) {\n  return innerFrom(input).pipe(subscribeOn(scheduler), observeOn(scheduler));\n}\n", "import { innerFrom } from '../observable/innerFrom';\nimport { observeOn } from '../operators/observeOn';\nimport { subscribeOn } from '../operators/subscribeOn';\nimport { SchedulerLike } from '../types';\n\nexport function schedulePromise<T>(input: PromiseLike<T>, scheduler: SchedulerLike) {\n  return innerFrom(input).pipe(subscribeOn(scheduler), observeOn(scheduler));\n}\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\n\nexport function scheduleArray<T>(input: ArrayLike<T>, scheduler: SchedulerLike) {\n  return new Observable<T>((subscriber) => {\n    // The current array index.\n    let i = 0;\n    // Start iterating over the array like on a schedule.\n    return scheduler.schedule(function () {\n      if (i === input.length) {\n        // If we have hit the end of the array like in the\n        // previous job, we can complete.\n        subscriber.complete();\n      } else {\n        // Otherwise let's next the value at the current index,\n        // then increment our index.\n        subscriber.next(input[i++]);\n        // If the last emission didn't cause us to close the subscriber\n        // (via take or some side effect), reschedule the job and we'll\n        // make another pass.\n        if (!subscriber.closed) {\n          
this.schedule();\n        }\n      }\n    });\n  });\n}\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\nimport { iterator as Symbol_iterator } from '../symbol/iterator';\nimport { isFunction } from '../util/isFunction';\nimport { executeSchedule } from '../util/executeSchedule';\n\n/**\n * Used in {@link scheduled} to create an observable from an Iterable.\n * @param input The iterable to create an observable from\n * @param scheduler The scheduler to use\n */\nexport function scheduleIterable<T>(input: Iterable<T>, scheduler: SchedulerLike) {\n  return new Observable<T>((subscriber) => {\n    let iterator: Iterator<T, T>;\n\n    // Schedule the initial creation of the iterator from\n    // the iterable. This is so the code in the iterable is\n    // not called until the scheduled job fires.\n    executeSchedule(subscriber, scheduler, () => {\n      // Create the iterator.\n      iterator = (input as any)[Symbol_iterator]();\n\n      executeSchedule(\n        subscriber,\n        scheduler,\n        () => {\n          let value: T;\n          let done: boolean | undefined;\n          try {\n            // Pull the value out of the iterator\n            ({ value, done } = iterator.next());\n          } catch (err) {\n            // We got an error while pulling from the iterator\n            subscriber.error(err);\n            return;\n          }\n\n          if (done) {\n            // If it is \"done\" we just complete. 
This mimics the\n            // behavior of JavaScript's `for..of` consumption of\n            // iterables, which will not emit the value from an iterator\n            // result of `{ done: true: value: 'here' }`.\n            subscriber.complete();\n          } else {\n            // The iterable is not done, emit the value.\n            subscriber.next(value);\n          }\n        },\n        0,\n        true\n      );\n    });\n\n    // During finalization, if we see this iterator has a `return` method,\n    // then we know it is a Generator, and not just an Iterator. So we call\n    // the `return()` function. This will ensure that any `finally { }` blocks\n    // inside of the generator we can hit will be hit properly.\n    return () => isFunction(iterator?.return) && iterator.return();\n  });\n}\n", "import { SchedulerLike } from '../types';\nimport { Observable } from '../Observable';\nimport { executeSchedule } from '../util/executeSchedule';\n\nexport function scheduleAsyncIterable<T>(input: AsyncIterable<T>, scheduler: SchedulerLike) {\n  if (!input) {\n    throw new Error('Iterable cannot be null');\n  }\n  return new Observable<T>((subscriber) => {\n    executeSchedule(subscriber, scheduler, () => {\n      const iterator = input[Symbol.asyncIterator]();\n      executeSchedule(\n        subscriber,\n        scheduler,\n        () => {\n          iterator.next().then((result) => {\n            if (result.done) {\n              // This will remove the subscriptions from\n              // the parent subscription.\n              subscriber.complete();\n            } else {\n              subscriber.next(result.value);\n            }\n          });\n        },\n        0,\n        true\n      );\n    });\n  });\n}\n", "import { SchedulerLike, ReadableStreamLike } from '../types';\nimport { Observable } from '../Observable';\nimport { scheduleAsyncIterable } from './scheduleAsyncIterable';\nimport { readableStreamLikeToAsyncGenerator } from 
'../util/isReadableStreamLike';\n\nexport function scheduleReadableStreamLike<T>(input: ReadableStreamLike<T>, scheduler: SchedulerLike): Observable<T> {\n  return scheduleAsyncIterable(readableStreamLikeToAsyncGenerator(input), scheduler);\n}\n", "import { scheduleObservable } from './scheduleObservable';\nimport { schedulePromise } from './schedulePromise';\nimport { scheduleArray } from './scheduleArray';\nimport { scheduleIterable } from './scheduleIterable';\nimport { scheduleAsyncIterable } from './scheduleAsyncIterable';\nimport { isInteropObservable } from '../util/isInteropObservable';\nimport { isPromise } from '../util/isPromise';\nimport { isArrayLike } from '../util/isArrayLike';\nimport { isIterable } from '../util/isIterable';\nimport { ObservableInput, SchedulerLike } from '../types';\nimport { Observable } from '../Observable';\nimport { isAsyncIterable } from '../util/isAsyncIterable';\nimport { createInvalidObservableTypeError } from '../util/throwUnobservableError';\nimport { isReadableStreamLike } from '../util/isReadableStreamLike';\nimport { scheduleReadableStreamLike } from './scheduleReadableStreamLike';\n\n/**\n * Converts from a common {@link ObservableInput} type to an observable where subscription and emissions\n * are scheduled on the provided scheduler.\n *\n * @see {@link from}\n * @see {@link of}\n *\n * @param input The observable, array, promise, iterable, etc you would like to schedule\n * @param scheduler The scheduler to use to schedule the subscription and emissions from\n * the returned observable.\n */\nexport function scheduled<T>(input: ObservableInput<T>, scheduler: SchedulerLike): Observable<T> {\n  if (input != null) {\n    if (isInteropObservable(input)) {\n      return scheduleObservable(input, scheduler);\n    }\n    if (isArrayLike(input)) {\n      return scheduleArray(input, scheduler);\n    }\n    if (isPromise(input)) {\n      return schedulePromise(input, scheduler);\n    }\n    if (isAsyncIterable(input)) {\n   
   return scheduleAsyncIterable(input, scheduler);\n    }\n    if (isIterable(input)) {\n      return scheduleIterable(input, scheduler);\n    }\n    if (isReadableStreamLike(input)) {\n      return scheduleReadableStreamLike(input, scheduler);\n    }\n  }\n  throw createInvalidObservableTypeError(input);\n}\n", "import { Observable } from '../Observable';\nimport { ObservableInput, SchedulerLike, ObservedValueOf } from '../types';\nimport { scheduled } from '../scheduled/scheduled';\nimport { innerFrom } from './innerFrom';\n\nexport function from<O extends ObservableInput<any>>(input: O): Observable<ObservedValueOf<O>>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function from<O extends ObservableInput<any>>(input: O, scheduler: SchedulerLike | undefined): Observable<ObservedValueOf<O>>;\n\n/**\n * Creates an Observable from an Array, an array-like object, a Promise, an iterable object, or an Observable-like object.\n *\n * <span class=\"informal\">Converts almost anything to an Observable.</span>\n *\n * ![](from.png)\n *\n * `from` converts various other objects and data types into Observables. It also converts a Promise, an array-like, or an\n * <a href=\"https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Iteration_protocols#iterable\" target=\"_blank\">iterable</a>\n * object into an Observable that emits the items in that promise, array, or iterable. A String, in this context, is treated\n * as an array of characters. 
Observable-like objects (contains a function named with the ES2015 Symbol for Observable) can also be\n * converted through this operator.\n *\n * ## Examples\n *\n * Converts an array to an Observable\n *\n * ```ts\n * import { from } from 'rxjs';\n *\n * const array = [10, 20, 30];\n * const result = from(array);\n *\n * result.subscribe(x => console.log(x));\n *\n * // Logs:\n * // 10\n * // 20\n * // 30\n * ```\n *\n * Convert an infinite iterable (from a generator) to an Observable\n *\n * ```ts\n * import { from, take } from 'rxjs';\n *\n * function* generateDoubles(seed) {\n *    let i = seed;\n *    while (true) {\n *      yield i;\n *      i = 2 * i; // double it\n *    }\n * }\n *\n * const iterator = generateDoubles(3);\n * const result = from(iterator).pipe(take(10));\n *\n * result.subscribe(x => console.log(x));\n *\n * // Logs:\n * // 3\n * // 6\n * // 12\n * // 24\n * // 48\n * // 96\n * // 192\n * // 384\n * // 768\n * // 1536\n * ```\n *\n * With `asyncScheduler`\n *\n * ```ts\n * import { from, asyncScheduler } from 'rxjs';\n *\n * console.log('start');\n *\n * const array = [10, 20, 30];\n * const result = from(array, asyncScheduler);\n *\n * result.subscribe(x => console.log(x));\n *\n * console.log('end');\n *\n * // Logs:\n * // 'start'\n * // 'end'\n * // 10\n * // 20\n * // 30\n * ```\n *\n * @see {@link fromEvent}\n * @see {@link fromEventPattern}\n *\n * @param {ObservableInput<T>} A subscription object, a Promise, an Observable-like,\n * an Array, an iterable, or an array-like object to be converted.\n * @param {SchedulerLike} An optional {@link SchedulerLike} on which to schedule the emission of values.\n * @return {Observable<T>}\n */\nexport function from<T>(input: ObservableInput<T>, scheduler?: SchedulerLike): Observable<T> {\n  return scheduler ? 
scheduled(input, scheduler) : innerFrom(input);\n}\n", "import { SchedulerLike, ValueFromArray } from '../types';\nimport { Observable } from '../Observable';\nimport { popScheduler } from '../util/args';\nimport { from } from './from';\n\n// Devs are more likely to pass null or undefined than they are a scheduler\n// without accompanying values. To make things easier for (naughty) devs who\n// use the `strictNullChecks: false` TypeScript compiler option, these\n// overloads with explicit null and undefined values are included.\n\nexport function of(value: null): Observable<null>;\nexport function of(value: undefined): Observable<undefined>;\n\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function of(scheduler: SchedulerLike): Observable<never>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function of<A extends readonly unknown[]>(...valuesAndScheduler: [...A, SchedulerLike]): Observable<ValueFromArray<A>>;\n\nexport function of(): Observable<never>;\n/** @deprecated Do not specify explicit type parameters. Signatures with type parameters that cannot be inferred will be removed in v8. 
*/\nexport function of<T>(): Observable<T>;\nexport function of<T>(value: T): Observable<T>;\nexport function of<A extends readonly unknown[]>(...values: A): Observable<ValueFromArray<A>>;\n\n/**\n * Converts the arguments to an observable sequence.\n *\n * <span class=\"informal\">Each argument becomes a `next` notification.</span>\n *\n * ![](of.png)\n *\n * Unlike {@link from}, it does not do any flattening and emits each argument in whole\n * as a separate `next` notification.\n *\n * ## Examples\n *\n * Emit the values `10, 20, 30`\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * of(10, 20, 30)\n *   .subscribe({\n *     next: value => console.log('next:', value),\n *     error: err => console.log('error:', err),\n *     complete: () => console.log('the end'),\n *   });\n *\n * // Outputs\n * // next: 10\n * // next: 20\n * // next: 30\n * // the end\n * ```\n *\n * Emit the array `[1, 2, 3]`\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * of([1, 2, 3])\n *   .subscribe({\n *     next: value => console.log('next:', value),\n *     error: err => console.log('error:', err),\n *     complete: () => console.log('the end'),\n *   });\n *\n * // Outputs\n * // next: [1, 2, 3]\n * // the end\n * ```\n *\n * @see {@link from}\n * @see {@link range}\n *\n * @param {...T} values A comma separated list of arguments you want to be emitted\n * @return {Observable} An Observable that emits the arguments\n * described above and then completes.\n */\nexport function of<T>(...args: Array<T | SchedulerLike>): Observable<T> {\n  const scheduler = popScheduler(args);\n  return from(args as T[], scheduler);\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { SchedulerLike } from '../types';\nimport { isFunction } from '../util/isFunction';\n\n/**\n * Creates an observable that will create an error instance and push it to the consumer as an error\n * immediately upon subscription.\n *\n * <span class=\"informal\">Just 
errors and does nothing else</span>\n *\n * ![](throw.png)\n *\n * This creation function is useful for creating an observable that will create an error and error every\n * time it is subscribed to. Generally, inside of most operators when you might want to return an errored\n * observable, this is unnecessary. In most cases, such as in the inner return of {@link concatMap},\n * {@link mergeMap}, {@link defer}, and many others, you can simply throw the error, and RxJS will pick\n * that up and notify the consumer of the error.\n *\n * ## Example\n *\n * Create a simple observable that will create a new error with a timestamp and log it\n * and the message every time you subscribe to it\n *\n * ```ts\n * import { throwError } from 'rxjs';\n *\n * let errorCount = 0;\n *\n * const errorWithTimestamp$ = throwError(() => {\n *   const error: any = new Error(`This is error number ${ ++errorCount }`);\n *   error.timestamp = Date.now();\n *   return error;\n * });\n *\n * errorWithTimestamp$.subscribe({\n *   error: err => console.log(err.timestamp, err.message)\n * });\n *\n * errorWithTimestamp$.subscribe({\n *   error: err => console.log(err.timestamp, err.message)\n * });\n *\n * // Logs the timestamp and a new error message for each subscription\n * ```\n *\n * ### Unnecessary usage\n *\n * Using `throwError` inside of an operator or creation function\n * with a callback, is usually not necessary\n *\n * ```ts\n * import { of, concatMap, timer, throwError } from 'rxjs';\n *\n * const delays$ = of(1000, 2000, Infinity, 3000);\n *\n * delays$.pipe(\n *   concatMap(ms => {\n *     if (ms < 10000) {\n *       return timer(ms);\n *     } else {\n *       // This is probably overkill.\n *       return throwError(() => new Error(`Invalid time ${ ms }`));\n *     }\n *   })\n * )\n * .subscribe({\n *   next: console.log,\n *   error: console.error\n * });\n * ```\n *\n * You can just throw the error instead\n *\n * ```ts\n * import { of, concatMap, timer } from 'rxjs';\n 
*\n * const delays$ = of(1000, 2000, Infinity, 3000);\n *\n * delays$.pipe(\n *   concatMap(ms => {\n *     if (ms < 10000) {\n *       return timer(ms);\n *     } else {\n *       // Cleaner and easier to read for most folks.\n *       throw new Error(`Invalid time ${ ms }`);\n *     }\n *   })\n * )\n * .subscribe({\n *   next: console.log,\n *   error: console.error\n * });\n * ```\n *\n * @param errorFactory A factory function that will create the error instance that is pushed.\n */\nexport function throwError(errorFactory: () => any): Observable<never>;\n\n/**\n * Returns an observable that will error with the specified error immediately upon subscription.\n *\n * @param error The error instance to emit\n * @deprecated Support for passing an error value will be removed in v8. Instead, pass a factory function to `throwError(() => new Error('test'))`. This is\n * because it will create the error at the moment it should be created and capture a more appropriate stack trace. If\n * for some reason you need to create the error ahead of time, you can still do that: `const err = new Error('test'); throwError(() => err);`.\n */\nexport function throwError(error: any): Observable<never>;\n\n/**\n * Notifies the consumer of an error using a given scheduler by scheduling it at delay `0` upon subscription.\n *\n * @param errorOrErrorFactory An error instance or error factory\n * @param scheduler A scheduler to use to schedule the error notification\n * @deprecated The `scheduler` parameter will be removed in v8.\n * Use `throwError` in combination with {@link observeOn}: `throwError(() => new Error('test')).pipe(observeOn(scheduler));`.\n * Details: https://rxjs.dev/deprecations/scheduler-argument\n */\nexport function throwError(errorOrErrorFactory: any, scheduler: SchedulerLike): Observable<never>;\n\nexport function throwError(errorOrErrorFactory: any, scheduler?: SchedulerLike): Observable<never> {\n  const errorFactory = isFunction(errorOrErrorFactory) ? 
errorOrErrorFactory : () => errorOrErrorFactory;\n  const init = (subscriber: Subscriber<never>) => subscriber.error(errorFactory());\n  return new Observable(scheduler ? (subscriber) => scheduler.schedule(init as any, 0, subscriber) : init);\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface EmptyError extends Error {}\n\nexport interface EmptyErrorCtor {\n  /**\n   * @deprecated Internal implementation detail. Do not construct error instances.\n   * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n   */\n  new (): EmptyError;\n}\n\n/**\n * An error thrown when an Observable or a sequence was queried but has no\n * elements.\n *\n * @see {@link first}\n * @see {@link last}\n * @see {@link single}\n * @see {@link firstValueFrom}\n * @see {@link lastValueFrom}\n *\n * @class EmptyError\n */\nexport const EmptyError: EmptyErrorCtor = createErrorClass((_super) => function EmptyErrorImpl(this: any) {\n  _super(this);\n  this.name = 'EmptyError';\n  this.message = 'no elements in sequence';\n});\n", "/**\n * Checks to see if a value is not only a `Date` object,\n * but a *valid* `Date` object that can be converted to a\n * number. For example, `new Date('blah')` is indeed an\n * `instanceof Date`, however it cannot be converted to a\n * number.\n */\nexport function isValidDate(value: any): value is Date {\n  return value instanceof Date && !isNaN(value as any);\n}\n", "import { OperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\nexport function map<T, R>(project: (value: T, index: number) => R): OperatorFunction<T, R>;\n/** @deprecated Use a closure instead of a `thisArg`. Signatures accepting a `thisArg` will be removed in v8. 
*/\nexport function map<T, R, A>(project: (this: A, value: T, index: number) => R, thisArg: A): OperatorFunction<T, R>;\n\n/**\n * Applies a given `project` function to each value emitted by the source\n * Observable, and emits the resulting values as an Observable.\n *\n * <span class=\"informal\">Like [Array.prototype.map()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map),\n * it passes each source value through a transformation function to get\n * corresponding output values.</span>\n *\n * ![](map.png)\n *\n * Similar to the well known `Array.prototype.map` function, this operator\n * applies a projection to each value and emits that projection in the output\n * Observable.\n *\n * ## Example\n *\n * Map every click to the `clientX` position of that click\n *\n * ```ts\n * import { fromEvent, map } from 'rxjs';\n *\n * const clicks = fromEvent<PointerEvent>(document, 'click');\n * const positions = clicks.pipe(map(ev => ev.clientX));\n *\n * positions.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link mapTo}\n * @see {@link pluck}\n *\n * @param {function(value: T, index: number): R} project The function to apply\n * to each `value` emitted by the source Observable. The `index` parameter is\n * the number `i` for the i-th emission that has happened since the\n * subscription, starting from the number `0`.\n * @param {any} [thisArg] An optional argument to define what `this` is in the\n * `project` function.\n * @return A function that returns an Observable that emits the values from the\n * source Observable transformed by the given `project` function.\n */\nexport function map<T, R>(project: (value: T, index: number) => R, thisArg?: any): OperatorFunction<T, R> {\n  return operate((source, subscriber) => {\n    // The index of the value from the source. 
Used with projection.\n    let index = 0;\n    // Subscribe to the source, all errors and completions are sent along\n    // to the consumer.\n    source.subscribe(\n      createOperatorSubscriber(subscriber, (value: T) => {\n        // Call the projection function with the appropriate this context,\n        // and send the resulting value to the consumer.\n        subscriber.next(project.call(thisArg, value, index++));\n      })\n    );\n  });\n}\n", "import { OperatorFunction } from \"../types\";\nimport { map } from \"../operators/map\";\n\nconst { isArray } = Array;\n\nfunction callOrApply<T, R>(fn: ((...values: T[]) => R), args: T|T[]): R {\n    return isArray(args) ? fn(...args) : fn(args);\n}\n\n/**\n * Used in several -- mostly deprecated -- situations where we need to \n * apply a list of arguments or a single argument to a result selector.\n */\nexport function mapOneOrManyArgs<T, R>(fn: ((...values: T[]) => R)): OperatorFunction<T|T[], R> {\n    return map(args => callOrApply(fn, args))\n}", "const { isArray } = Array;\nconst { getPrototypeOf, prototype: objectProto, keys: getKeys } = Object;\n\n/**\n * Used in functions where either a list of arguments, a single array of arguments, or a\n * dictionary of arguments can be returned. 
Returns an object with an `args` property with\n * the arguments in an array, if it is a dictionary, it will also return the `keys` in another\n * property.\n */\nexport function argsArgArrayOrObject<T, O extends Record<string, T>>(args: T[] | [O] | [T[]]): { args: T[]; keys: string[] | null } {\n  if (args.length === 1) {\n    const first = args[0];\n    if (isArray(first)) {\n      return { args: first, keys: null };\n    }\n    if (isPOJO(first)) {\n      const keys = getKeys(first);\n      return {\n        args: keys.map((key) => first[key]),\n        keys,\n      };\n    }\n  }\n\n  return { args: args as T[], keys: null };\n}\n\nfunction isPOJO(obj: any): obj is object {\n  return obj && typeof obj === 'object' && getPrototypeOf(obj) === objectProto;\n}\n", "export function createObject(keys: string[], values: any[]) {\n  return keys.reduce((result, key, i) => ((result[key] = values[i]), result), {} as any);\n}\n", "import { Observable } from '../Observable';\nimport { ObservableInput, SchedulerLike, ObservedValueOf, ObservableInputTuple } from '../types';\nimport { argsArgArrayOrObject } from '../util/argsArgArrayOrObject';\nimport { Subscriber } from '../Subscriber';\nimport { from } from './from';\nimport { identity } from '../util/identity';\nimport { Subscription } from '../Subscription';\nimport { mapOneOrManyArgs } from '../util/mapOneOrManyArgs';\nimport { popResultSelector, popScheduler } from '../util/args';\nimport { createObject } from '../util/createObject';\nimport { createOperatorSubscriber } from '../operators/OperatorSubscriber';\nimport { AnyCatcher } from '../AnyCatcher';\nimport { executeSchedule } from '../util/executeSchedule';\n\n// combineLatest(any)\n// We put this first because we need to catch cases where the user has supplied\n// _exactly `any`_ as the argument. 
Since `any` literally matches _anything_,\n// we don't want it to randomly hit one of the other type signatures below,\n// as we have no idea at build-time what type we should be returning when given an any.\n\n/**\n * You have passed `any` here, we can't figure out if it is\n * an array or an object, so you're getting `unknown`. Use better types.\n * @param arg Something typed as `any`\n */\nexport function combineLatest<T extends AnyCatcher>(arg: T): Observable<unknown>;\n\n// combineLatest([a, b, c])\nexport function combineLatest(sources: []): Observable<never>;\nexport function combineLatest<A extends readonly unknown[]>(sources: readonly [...ObservableInputTuple<A>]): Observable<A>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `combineLatestAll`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function combineLatest<A extends readonly unknown[], R>(\n  sources: readonly [...ObservableInputTuple<A>],\n  resultSelector: (...values: A) => R,\n  scheduler: SchedulerLike\n): Observable<R>;\nexport function combineLatest<A extends readonly unknown[], R>(\n  sources: readonly [...ObservableInputTuple<A>],\n  resultSelector: (...values: A) => R\n): Observable<R>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `combineLatestAll`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function combineLatest<A extends readonly unknown[]>(\n  sources: readonly [...ObservableInputTuple<A>],\n  scheduler: SchedulerLike\n): Observable<A>;\n\n// combineLatest(a, b, c)\n/** @deprecated Pass an array of sources instead. The rest-parameters signature will be removed in v8. Details: https://rxjs.dev/deprecations/array-argument */\nexport function combineLatest<A extends readonly unknown[]>(...sources: [...ObservableInputTuple<A>]): Observable<A>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `combineLatestAll`. 
Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function combineLatest<A extends readonly unknown[], R>(\n  ...sourcesAndResultSelectorAndScheduler: [...ObservableInputTuple<A>, (...values: A) => R, SchedulerLike]\n): Observable<R>;\n/** @deprecated Pass an array of sources instead. The rest-parameters signature will be removed in v8. Details: https://rxjs.dev/deprecations/array-argument */\nexport function combineLatest<A extends readonly unknown[], R>(\n  ...sourcesAndResultSelector: [...ObservableInputTuple<A>, (...values: A) => R]\n): Observable<R>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `combineLatestAll`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function combineLatest<A extends readonly unknown[]>(\n  ...sourcesAndScheduler: [...ObservableInputTuple<A>, SchedulerLike]\n): Observable<A>;\n\n// combineLatest({a, b, c})\nexport function combineLatest(sourcesObject: { [K in any]: never }): Observable<never>;\nexport function combineLatest<T extends Record<string, ObservableInput<any>>>(\n  sourcesObject: T\n): Observable<{ [K in keyof T]: ObservedValueOf<T[K]> }>;\n\n/**\n * Combines multiple Observables to create an Observable whose values are\n * calculated from the latest values of each of its input Observables.\n *\n * <span class=\"informal\">Whenever any input Observable emits a value, it\n * computes a formula using the latest values from all the inputs, then emits\n * the output of that formula.</span>\n *\n * ![](combineLatest.png)\n *\n * `combineLatest` combines the values from all the Observables passed in the\n * observables array. This is done by subscribing to each Observable in order and,\n * whenever any Observable emits, collecting an array of the most recent\n * values from each Observable. 
So if you pass `n` Observables to this operator,\n * the returned Observable will always emit an array of `n` values, in an order\n * corresponding to the order of the passed Observables (the value from the first Observable\n * will be at index 0 of the array and so on).\n *\n * Static version of `combineLatest` accepts an array of Observables. Note that an array of\n * Observables is a good choice, if you don't know beforehand how many Observables\n * you will combine. Passing an empty array will result in an Observable that\n * completes immediately.\n *\n * To ensure the output array always has the same length, `combineLatest` will\n * actually wait for all input Observables to emit at least once,\n * before it starts emitting results. This means if some Observable emits\n * values before other Observables started emitting, all these values but the last\n * will be lost. On the other hand, if some Observable does not emit a value but\n * completes, resulting Observable will complete at the same moment without\n * emitting anything, since it will now be impossible to include a value from the\n * completed Observable in the resulting array. Also, if some input Observable does\n * not emit any value and never completes, `combineLatest` will also never emit\n * and never complete, since, again, it will wait for all streams to emit some\n * value.\n *\n * If at least one Observable was passed to `combineLatest` and all passed Observables\n * emitted something, the resulting Observable will complete when all combined\n * streams complete. So even if some Observable completes, the result of\n * `combineLatest` will still emit values when other Observables do. In case\n * of a completed Observable, its value from now on will always be the last\n * emitted value. 
On the other hand, if any Observable errors, `combineLatest`\n * will error immediately as well, and all other Observables will be unsubscribed.\n *\n * ## Examples\n *\n * Combine two timer Observables\n *\n * ```ts\n * import { timer, combineLatest } from 'rxjs';\n *\n * const firstTimer = timer(0, 1000); // emit 0, 1, 2... after every second, starting from now\n * const secondTimer = timer(500, 1000); // emit 0, 1, 2... after every second, starting 0,5s from now\n * const combinedTimers = combineLatest([firstTimer, secondTimer]);\n * combinedTimers.subscribe(value => console.log(value));\n * // Logs\n * // [0, 0] after 0.5s\n * // [1, 0] after 1s\n * // [1, 1] after 1.5s\n * // [2, 1] after 2s\n * ```\n *\n * Combine a dictionary of Observables\n *\n * ```ts\n * import { of, delay, startWith, combineLatest } from 'rxjs';\n *\n * const observables = {\n *   a: of(1).pipe(delay(1000), startWith(0)),\n *   b: of(5).pipe(delay(5000), startWith(0)),\n *   c: of(10).pipe(delay(10000), startWith(0))\n * };\n * const combined = combineLatest(observables);\n * combined.subscribe(value => console.log(value));\n * // Logs\n * // { a: 0, b: 0, c: 0 } immediately\n * // { a: 1, b: 0, c: 0 } after 1s\n * // { a: 1, b: 5, c: 0 } after 5s\n * // { a: 1, b: 5, c: 10 } after 10s\n * ```\n *\n * Combine an array of Observables\n *\n * ```ts\n * import { of, delay, startWith, combineLatest } from 'rxjs';\n *\n * const observables = [1, 5, 10].map(\n *   n => of(n).pipe(\n *     delay(n * 1000), // emit 0 and then emit n after n seconds\n *     startWith(0)\n *   )\n * );\n * const combined = combineLatest(observables);\n * combined.subscribe(value => console.log(value));\n * // Logs\n * // [0, 0, 0] immediately\n * // [1, 0, 0] after 1s\n * // [1, 5, 0] after 5s\n * // [1, 5, 10] after 10s\n * ```\n *\n * Use map operator to dynamically calculate the Body-Mass Index\n *\n * ```ts\n * import { of, combineLatest, map } from 'rxjs';\n *\n * const weight = of(70, 72, 76, 79, 75);\n * 
const height = of(1.76, 1.77, 1.78);\n * const bmi = combineLatest([weight, height]).pipe(\n *   map(([w, h]) => w / (h * h)),\n * );\n * bmi.subscribe(x => console.log('BMI is ' + x));\n *\n * // With output to console:\n * // BMI is 24.212293388429753\n * // BMI is 23.93948099205209\n * // BMI is 23.671253629592222\n * ```\n *\n * @see {@link combineLatestAll}\n * @see {@link merge}\n * @see {@link withLatestFrom}\n *\n * @param {ObservableInput} [observables] An array of input Observables to combine with each other.\n * An array of Observables must be given as the first argument.\n * @param {function} [project] An optional function to project the values from\n * the combined latest values into a new value on the output Observable.\n * @param {SchedulerLike} [scheduler=null] The {@link SchedulerLike} to use for subscribing to\n * each input Observable.\n * @return {Observable} An Observable of projected values from the most recent\n * values from each input Observable, or an array of the most recent values from\n * each input Observable.\n */\nexport function combineLatest<O extends ObservableInput<any>, R>(...args: any[]): Observable<R> | Observable<ObservedValueOf<O>[]> {\n  const scheduler = popScheduler(args);\n  const resultSelector = popResultSelector(args);\n\n  const { args: observables, keys } = argsArgArrayOrObject(args);\n\n  if (observables.length === 0) {\n    // If no observables are passed, or someone has passed an empty array\n    // of observables, or even an empty object POJO, we need to just\n    // complete (EMPTY), but we have to honor the scheduler provided if any.\n    return from([], scheduler as any);\n  }\n\n  const result = new Observable<ObservedValueOf<O>[]>(\n    combineLatestInit(\n      observables as ObservableInput<ObservedValueOf<O>>[],\n      scheduler,\n      keys\n        ? 
// A handler for scrubbing the array of args into a dictionary.\n          (values) => createObject(keys, values)\n        : // A passthrough to just return the array\n          identity\n    )\n  );\n\n  return resultSelector ? (result.pipe(mapOneOrManyArgs(resultSelector)) as Observable<R>) : result;\n}\n\nexport function combineLatestInit(\n  observables: ObservableInput<any>[],\n  scheduler?: SchedulerLike,\n  valueTransform: (values: any[]) => any = identity\n) {\n  return (subscriber: Subscriber<any>) => {\n    // The outer subscription. We're capturing this in a function\n    // because we may have to schedule it.\n    maybeSchedule(\n      scheduler,\n      () => {\n        const { length } = observables;\n        // A store for the values each observable has emitted so far. We match observable to value on index.\n        const values = new Array(length);\n        // The number of currently active subscriptions, as they complete, we decrement this number to see if\n        // we are all done combining values, so we can complete the result.\n        let active = length;\n        // The number of inner sources that still haven't emitted the first value\n        // We need to track this because all sources need to emit one value in order\n        // to start emitting values.\n        let remainingFirstValues = length;\n        // The loop to kick off subscription. 
We're keying everything on index `i` to relate the observables passed\n        // in to the slot in the output array or the key in the array of keys in the output dictionary.\n        for (let i = 0; i < length; i++) {\n          maybeSchedule(\n            scheduler,\n            () => {\n              const source = from(observables[i], scheduler as any);\n              let hasFirstValue = false;\n              source.subscribe(\n                createOperatorSubscriber(\n                  subscriber,\n                  (value) => {\n                    // When we get a value, record it in our set of values.\n                    values[i] = value;\n                    if (!hasFirstValue) {\n                      // If this is our first value, record that.\n                      hasFirstValue = true;\n                      remainingFirstValues--;\n                    }\n                    if (!remainingFirstValues) {\n                      // We're not waiting for any more\n                      // first values, so we can emit!\n                      subscriber.next(valueTransform(values.slice()));\n                    }\n                  },\n                  () => {\n                    if (!--active) {\n                      // We only complete the result if we have no more active\n                      // inner observables.\n                      subscriber.complete();\n                    }\n                  }\n                )\n              );\n            },\n            subscriber\n          );\n        }\n      },\n      subscriber\n    );\n  };\n}\n\n/**\n * A small utility to handle the couple of locations where we want to schedule if a scheduler was provided,\n * but we don't if there was no scheduler.\n */\nfunction maybeSchedule(scheduler: SchedulerLike | undefined, execute: () => void, subscription: Subscription) {\n  if (scheduler) {\n    executeSchedule(subscription, scheduler, execute);\n  } else {\n    execute();\n  }\n}\n", "import { 
Observable } from '../Observable';\nimport { innerFrom } from '../observable/innerFrom';\nimport { Subscriber } from '../Subscriber';\nimport { ObservableInput, SchedulerLike } from '../types';\nimport { executeSchedule } from '../util/executeSchedule';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * A process embodying the general \"merge\" strategy. This is used in\n * `mergeMap` and `mergeScan` because the logic is otherwise nearly identical.\n * @param source The original source observable\n * @param subscriber The consumer subscriber\n * @param project The projection function to get our inner sources\n * @param concurrent The number of concurrent inner subscriptions\n * @param onBeforeNext Additional logic to apply before nexting to our consumer\n * @param expand If `true` this will perform an \"expand\" strategy, which differs only\n * in that it recurses, and the inner subscription must be schedule-able.\n * @param innerSubScheduler A scheduler to use to schedule inner subscriptions,\n * this is to support the expand strategy, mostly, and should be deprecated\n */\nexport function mergeInternals<T, R>(\n  source: Observable<T>,\n  subscriber: Subscriber<R>,\n  project: (value: T, index: number) => ObservableInput<R>,\n  concurrent: number,\n  onBeforeNext?: (innerValue: R) => void,\n  expand?: boolean,\n  innerSubScheduler?: SchedulerLike,\n  additionalFinalizer?: () => void\n) {\n  // Buffered values, in the event of going over our concurrency limit\n  const buffer: T[] = [];\n  // The number of active inner subscriptions.\n  let active = 0;\n  // An index to pass to our accumulator function\n  let index = 0;\n  // Whether or not the outer source has completed.\n  let isComplete = false;\n\n  /**\n   * Checks to see if we can complete our result or not.\n   */\n  const checkComplete = () => {\n    // If the outer has completed, and nothing is left in the buffer,\n    // and we don't have any active inner subscriptions, then we 
can\n    // Emit the state and complete.\n    if (isComplete && !buffer.length && !active) {\n      subscriber.complete();\n    }\n  };\n\n  // If we're under our concurrency limit, just start the inner subscription, otherwise buffer and wait.\n  const outerNext = (value: T) => (active < concurrent ? doInnerSub(value) : buffer.push(value));\n\n  const doInnerSub = (value: T) => {\n    // If we're expanding, we need to emit the outer values and the inner values\n    // as the inners will \"become outers\" in a way as they are recursively fed\n    // back to the projection mechanism.\n    expand && subscriber.next(value as any);\n\n    // Increment the number of active subscriptions so we can track it\n    // against our concurrency limit later.\n    active++;\n\n    // A flag used to show that the inner observable completed.\n    // This is checked during finalization to see if we should\n    // move to the next item in the buffer, if there is on.\n    let innerComplete = false;\n\n    // Start our inner subscription.\n    innerFrom(project(value, index++)).subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (innerValue) => {\n          // `mergeScan` has additional handling here. For example\n          // taking the inner value and updating state.\n          onBeforeNext?.(innerValue);\n\n          if (expand) {\n            // If we're expanding, then just recurse back to our outer\n            // handler. 
It will emit the value first thing.\n            outerNext(innerValue as any);\n          } else {\n            // Otherwise, emit the inner value.\n            subscriber.next(innerValue);\n          }\n        },\n        () => {\n          // Flag that we have completed, so we know to check the buffer\n          // during finalization.\n          innerComplete = true;\n        },\n        // Errors are passed to the destination.\n        undefined,\n        () => {\n          // During finalization, if the inner completed (it wasn't errored or\n          // cancelled), then we want to try the next item in the buffer if\n          // there is one.\n          if (innerComplete) {\n            // We have to wrap this in a try/catch because it happens during\n            // finalization, possibly asynchronously, and we want to pass\n            // any errors that happen (like in a projection function) to\n            // the outer Subscriber.\n            try {\n              // INNER SOURCE COMPLETE\n              // Decrement the active count to ensure that the next time\n              // we try to call `doInnerSub`, the number is accurate.\n              active--;\n              // If we have more values in the buffer, try to process those\n              // Note that this call will increment `active` ahead of the\n              // next conditional, if there were any more inner subscriptions\n              // to start.\n              while (buffer.length && active < concurrent) {\n                const bufferedValue = buffer.shift()!;\n                // Particularly for `expand`, we need to check to see if a scheduler was provided\n                // for when we want to start our inner subscription. 
Otherwise, we just start\n                // are next inner subscription.\n                if (innerSubScheduler) {\n                  executeSchedule(subscriber, innerSubScheduler, () => doInnerSub(bufferedValue));\n                } else {\n                  doInnerSub(bufferedValue);\n                }\n              }\n              // Check to see if we can complete, and complete if so.\n              checkComplete();\n            } catch (err) {\n              subscriber.error(err);\n            }\n          }\n        }\n      )\n    );\n  };\n\n  // Subscribe to our source observable.\n  source.subscribe(\n    createOperatorSubscriber(subscriber, outerNext, () => {\n      // Outer completed, make a note of it, and check to see if we can complete everything.\n      isComplete = true;\n      checkComplete();\n    })\n  );\n\n  // Additional finalization (for when the destination is torn down).\n  // Other finalization is added implicitly via subscription above.\n  return () => {\n    additionalFinalizer?.();\n  };\n}\n", "import { ObservableInput, OperatorFunction, ObservedValueOf } from '../types';\nimport { map } from './map';\nimport { innerFrom } from '../observable/innerFrom';\nimport { operate } from '../util/lift';\nimport { mergeInternals } from './mergeInternals';\nimport { isFunction } from '../util/isFunction';\n\n/* tslint:disable:max-line-length */\nexport function mergeMap<T, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  concurrent?: number\n): OperatorFunction<T, ObservedValueOf<O>>;\n/** @deprecated The `resultSelector` parameter will be removed in v8. Use an inner `map` instead. Details: https://rxjs.dev/deprecations/resultSelector */\nexport function mergeMap<T, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  resultSelector: undefined,\n  concurrent?: number\n): OperatorFunction<T, ObservedValueOf<O>>;\n/** @deprecated The `resultSelector` parameter will be removed in v8. 
Use an inner `map` instead. Details: https://rxjs.dev/deprecations/resultSelector */\nexport function mergeMap<T, R, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  resultSelector: (outerValue: T, innerValue: ObservedValueOf<O>, outerIndex: number, innerIndex: number) => R,\n  concurrent?: number\n): OperatorFunction<T, R>;\n/* tslint:enable:max-line-length */\n\n/**\n * Projects each source value to an Observable which is merged in the output\n * Observable.\n *\n * <span class=\"informal\">Maps each value to an Observable, then flattens all of\n * these inner Observables using {@link mergeAll}.</span>\n *\n * ![](mergeMap.png)\n *\n * Returns an Observable that emits items based on applying a function that you\n * supply to each item emitted by the source Observable, where that function\n * returns an Observable, and then merging those resulting Observables and\n * emitting the results of this merger.\n *\n * ## Example\n *\n * Map and flatten each letter to an Observable ticking every 1 second\n *\n * ```ts\n * import { of, mergeMap, interval, map } from 'rxjs';\n *\n * const letters = of('a', 'b', 'c');\n * const result = letters.pipe(\n *   mergeMap(x => interval(1000).pipe(map(i => x + i)))\n * );\n *\n * result.subscribe(x => console.log(x));\n *\n * // Results in the following:\n * // a0\n * // b0\n * // c0\n * // a1\n * // b1\n * // c1\n * // continues to list a, b, c every second with respective ascending integers\n * ```\n *\n * @see {@link concatMap}\n * @see {@link exhaustMap}\n * @see {@link merge}\n * @see {@link mergeAll}\n * @see {@link mergeMapTo}\n * @see {@link mergeScan}\n * @see {@link switchMap}\n *\n * @param {function(value: T, ?index: number): ObservableInput} project A function\n * that, when applied to an item emitted by the source Observable, returns an\n * Observable.\n * @param {number} [concurrent=Infinity] Maximum number of input\n * Observables being subscribed to concurrently.\n * @return A function 
that returns an Observable that emits the result of\n * applying the projection function (and the optional deprecated\n * `resultSelector`) to each item emitted by the source Observable and merging\n * the results of the Observables obtained from this transformation.\n */\nexport function mergeMap<T, R, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  resultSelector?: ((outerValue: T, innerValue: ObservedValueOf<O>, outerIndex: number, innerIndex: number) => R) | number,\n  concurrent: number = Infinity\n): OperatorFunction<T, ObservedValueOf<O> | R> {\n  if (isFunction(resultSelector)) {\n    // DEPRECATED PATH\n    return mergeMap((a, i) => map((b: any, ii: number) => resultSelector(a, b, i, ii))(innerFrom(project(a, i))), concurrent);\n  } else if (typeof resultSelector === 'number') {\n    concurrent = resultSelector;\n  }\n\n  return operate((source, subscriber) => mergeInternals(source, subscriber, project, concurrent));\n}\n", "import { mergeMap } from './mergeMap';\nimport { identity } from '../util/identity';\nimport { OperatorFunction, ObservableInput, ObservedValueOf } from '../types';\n\n/**\n * Converts a higher-order Observable into a first-order Observable which\n * concurrently delivers all values that are emitted on the inner Observables.\n *\n * <span class=\"informal\">Flattens an Observable-of-Observables.</span>\n *\n * ![](mergeAll.png)\n *\n * `mergeAll` subscribes to an Observable that emits Observables, also known as\n * a higher-order Observable. Each time it observes one of these emitted inner\n * Observables, it subscribes to that and delivers all the values from the\n * inner Observable on the output Observable. The output Observable only\n * completes once all inner Observables have completed. 
Any error delivered by\n * a inner Observable will be immediately emitted on the output Observable.\n *\n * ## Examples\n *\n * Spawn a new interval Observable for each click event, and blend their outputs as one Observable\n *\n * ```ts\n * import { fromEvent, map, interval, mergeAll } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const higherOrder = clicks.pipe(map(() => interval(1000)));\n * const firstOrder = higherOrder.pipe(mergeAll());\n *\n * firstOrder.subscribe(x => console.log(x));\n * ```\n *\n * Count from 0 to 9 every second for each click, but only allow 2 concurrent timers\n *\n * ```ts\n * import { fromEvent, map, interval, take, mergeAll } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const higherOrder = clicks.pipe(\n *   map(() => interval(1000).pipe(take(10)))\n * );\n * const firstOrder = higherOrder.pipe(mergeAll(2));\n *\n * firstOrder.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link combineLatestAll}\n * @see {@link concatAll}\n * @see {@link exhaustAll}\n * @see {@link merge}\n * @see {@link mergeMap}\n * @see {@link mergeMapTo}\n * @see {@link mergeScan}\n * @see {@link switchAll}\n * @see {@link switchMap}\n * @see {@link zipAll}\n *\n * @param {number} [concurrent=Infinity] Maximum number of inner\n * Observables being subscribed to concurrently.\n * @return A function that returns an Observable that emits values coming from\n * all the inner Observables emitted by the source Observable.\n */\nexport function mergeAll<O extends ObservableInput<any>>(concurrent: number = Infinity): OperatorFunction<O, ObservedValueOf<O>> {\n  return mergeMap(identity, concurrent);\n}\n", "import { mergeAll } from './mergeAll';\nimport { OperatorFunction, ObservableInput, ObservedValueOf } from '../types';\n\n/**\n * Converts a higher-order Observable into a first-order Observable by\n * concatenating the inner Observables in order.\n *\n * <span class=\"informal\">Flattens an 
Observable-of-Observables by putting one\n * inner Observable after the other.</span>\n *\n * ![](concatAll.svg)\n *\n * Joins every Observable emitted by the source (a higher-order Observable), in\n * a serial fashion. It subscribes to each inner Observable only after the\n * previous inner Observable has completed, and merges all of their values into\n * the returned observable.\n *\n * __Warning:__ If the source Observable emits Observables quickly and\n * endlessly, and the inner Observables it emits generally complete slower than\n * the source emits, you can run into memory issues as the incoming Observables\n * collect in an unbounded buffer.\n *\n * Note: `concatAll` is equivalent to `mergeAll` with concurrency parameter set\n * to `1`.\n *\n * ## Example\n *\n * For each click event, tick every second from 0 to 3, with no concurrency\n *\n * ```ts\n * import { fromEvent, map, interval, take, concatAll } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const higherOrder = clicks.pipe(\n *   map(() => interval(1000).pipe(take(4)))\n * );\n * const firstOrder = higherOrder.pipe(concatAll());\n * firstOrder.subscribe(x => console.log(x));\n *\n * // Results in the following:\n * // (results are not concurrent)\n * // For every click on the \"document\" it will emit values 0 to 3 spaced\n * // on a 1000ms interval\n * // one click = 1000ms-> 0 -1000ms-> 1 -1000ms-> 2 -1000ms-> 3\n * ```\n *\n * @see {@link combineLatestAll}\n * @see {@link concat}\n * @see {@link concatMap}\n * @see {@link concatMapTo}\n * @see {@link exhaustAll}\n * @see {@link mergeAll}\n * @see {@link switchAll}\n * @see {@link switchMap}\n * @see {@link zipAll}\n *\n * @return A function that returns an Observable emitting values from all the\n * inner Observables concatenated.\n */\nexport function concatAll<O extends ObservableInput<any>>(): OperatorFunction<O, ObservedValueOf<O>> {\n  return mergeAll(1);\n}\n", "import { Observable } from '../Observable';\nimport { 
ObservableInputTuple, SchedulerLike } from '../types';\nimport { concatAll } from '../operators/concatAll';\nimport { popScheduler } from '../util/args';\nimport { from } from './from';\n\nexport function concat<T extends readonly unknown[]>(...inputs: [...ObservableInputTuple<T>]): Observable<T[number]>;\nexport function concat<T extends readonly unknown[]>(\n  ...inputsAndScheduler: [...ObservableInputTuple<T>, SchedulerLike]\n): Observable<T[number]>;\n\n/**\n * Creates an output Observable which sequentially emits all values from the first given\n * Observable and then moves on to the next.\n *\n * <span class=\"informal\">Concatenates multiple Observables together by\n * sequentially emitting their values, one Observable after the other.</span>\n *\n * ![](concat.png)\n *\n * `concat` joins multiple Observables together, by subscribing to them one at a time and\n * merging their results into the output Observable. You can pass either an array of\n * Observables, or put them directly as arguments. Passing an empty array will result\n * in Observable that completes immediately.\n *\n * `concat` will subscribe to first input Observable and emit all its values, without\n * changing or affecting them in any way. When that Observable completes, it will\n * subscribe to then next Observable passed and, again, emit its values. This will be\n * repeated, until the operator runs out of Observables. When last input Observable completes,\n * `concat` will complete as well. At any given moment only one Observable passed to operator\n * emits values. If you would like to emit values from passed Observables concurrently, check out\n * {@link merge} instead, especially with optional `concurrent` parameter. 
As a matter of fact,\n * `concat` is an equivalent of `merge` operator with `concurrent` parameter set to `1`.\n *\n * Note that if some input Observable never completes, `concat` will also never complete\n * and Observables following the one that did not complete will never be subscribed. On the other\n * hand, if some Observable simply completes immediately after it is subscribed, it will be\n * invisible for `concat`, which will just move on to the next Observable.\n *\n * If any Observable in chain errors, instead of passing control to the next Observable,\n * `concat` will error immediately as well. Observables that would be subscribed after\n * the one that emitted error, never will.\n *\n * If you pass to `concat` the same Observable many times, its stream of values\n * will be \"replayed\" on every subscription, which means you can repeat given Observable\n * as many times as you like. If passing the same Observable to `concat` 1000 times becomes tedious,\n * you can always use {@link repeat}.\n *\n * ## Examples\n *\n * Concatenate a timer counting from 0 to 3 with a synchronous sequence from 1 to 10\n *\n * ```ts\n * import { interval, take, range, concat } from 'rxjs';\n *\n * const timer = interval(1000).pipe(take(4));\n * const sequence = range(1, 10);\n * const result = concat(timer, sequence);\n * result.subscribe(x => console.log(x));\n *\n * // results in:\n * // 0 -1000ms-> 1 -1000ms-> 2 -1000ms-> 3 -immediate-> 1 ... 10\n * ```\n *\n * Concatenate 3 Observables\n *\n * ```ts\n * import { interval, take, concat } from 'rxjs';\n *\n * const timer1 = interval(1000).pipe(take(10));\n * const timer2 = interval(2000).pipe(take(6));\n * const timer3 = interval(500).pipe(take(10));\n *\n * const result = concat(timer1, timer2, timer3);\n * result.subscribe(x => console.log(x));\n *\n * // results in the following:\n * // (Prints to console sequentially)\n * // -1000ms-> 0 -1000ms-> 1 -1000ms-> ... 9\n * // -2000ms-> 0 -2000ms-> 1 -2000ms-> ... 
5\n * // -500ms-> 0 -500ms-> 1 -500ms-> ... 9\n * ```\n *\n * Concatenate the same Observable to repeat it\n *\n * ```ts\n * import { interval, take, concat } from 'rxjs';\n *\n * const timer = interval(1000).pipe(take(2));\n *\n * concat(timer, timer) // concatenating the same Observable!\n *   .subscribe({\n *     next: value => console.log(value),\n *     complete: () => console.log('...and it is done!')\n *   });\n *\n * // Logs:\n * // 0 after 1s\n * // 1 after 2s\n * // 0 after 3s\n * // 1 after 4s\n * // '...and it is done!' also after 4s\n * ```\n *\n * @see {@link concatAll}\n * @see {@link concatMap}\n * @see {@link concatMapTo}\n * @see {@link startWith}\n * @see {@link endWith}\n *\n * @param args Input Observables to concatenate.\n */\nexport function concat(...args: any[]): Observable<unknown> {\n  return concatAll()(from(args, popScheduler(args)));\n}\n", "import { Observable } from '../Observable';\nimport { ObservedValueOf, ObservableInput } from '../types';\nimport { innerFrom } from './innerFrom';\n\n/**\n * Creates an Observable that, on subscribe, calls an Observable factory to\n * make an Observable for each new Observer.\n *\n * <span class=\"informal\">Creates the Observable lazily, that is, only when it\n * is subscribed.\n * </span>\n *\n * ![](defer.png)\n *\n * `defer` allows you to create an Observable only when the Observer\n * subscribes. It waits until an Observer subscribes to it, calls the given\n * factory function to get an Observable -- where a factory function typically\n * generates a new Observable -- and subscribes the Observer to this Observable.\n * In case the factory function returns a falsy value, then EMPTY is used as\n * Observable instead. 
Last but not least, an exception during the factory\n * function call is transferred to the Observer by calling `error`.\n *\n * ## Example\n *\n * Subscribe to either an Observable of clicks or an Observable of interval, at random\n *\n * ```ts\n * import { defer, fromEvent, interval } from 'rxjs';\n *\n * const clicksOrInterval = defer(() => {\n *   return Math.random() > 0.5\n *     ? fromEvent(document, 'click')\n *     : interval(1000);\n * });\n * clicksOrInterval.subscribe(x => console.log(x));\n *\n * // Results in the following behavior:\n * // If the result of Math.random() is greater than 0.5 it will listen\n * // for clicks anywhere on the \"document\"; when document is clicked it\n * // will log a MouseEvent object to the console. If the result is less\n * // than 0.5 it will emit ascending numbers, one every second(1000ms).\n * ```\n *\n * @see {@link Observable}\n *\n * @param {function(): ObservableInput} observableFactory The Observable\n * factory function to invoke for each Observer that subscribes to the output\n * Observable. 
May also return a Promise, which will be converted on the fly\n * to an Observable.\n * @return {Observable} An Observable whose Observers' subscriptions trigger\n * an invocation of the given Observable factory function.\n */\nexport function defer<R extends ObservableInput<any>>(observableFactory: () => R): Observable<ObservedValueOf<R>> {\n  return new Observable<ObservedValueOf<R>>((subscriber) => {\n    innerFrom(observableFactory()).subscribe(subscriber);\n  });\n}\n", "import { innerFrom } from '../observable/innerFrom';\nimport { Observable } from '../Observable';\nimport { mergeMap } from '../operators/mergeMap';\nimport { isArrayLike } from '../util/isArrayLike';\nimport { isFunction } from '../util/isFunction';\nimport { mapOneOrManyArgs } from '../util/mapOneOrManyArgs';\n\n// These constants are used to create handler registry functions using array mapping below.\nconst nodeEventEmitterMethods = ['addListener', 'removeListener'] as const;\nconst eventTargetMethods = ['addEventListener', 'removeEventListener'] as const;\nconst jqueryMethods = ['on', 'off'] as const;\n\nexport interface NodeStyleEventEmitter {\n  addListener(eventName: string | symbol, handler: NodeEventHandler): this;\n  removeListener(eventName: string | symbol, handler: NodeEventHandler): this;\n}\n\nexport type NodeEventHandler = (...args: any[]) => void;\n\n// For APIs that implement `addListener` and `removeListener` methods that may\n// not use the same arguments or return EventEmitter values\n// such as React Native\nexport interface NodeCompatibleEventEmitter {\n  addListener(eventName: string, handler: NodeEventHandler): void | {};\n  removeListener(eventName: string, handler: NodeEventHandler): void | {};\n}\n\n// Use handler types like those in @types/jquery. 
See:\n// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/847731ba1d7fa6db6b911c0e43aa0afe596e7723/types/jquery/misc.d.ts#L6395\nexport interface JQueryStyleEventEmitter<TContext, T> {\n  on(eventName: string, handler: (this: TContext, t: T, ...args: any[]) => any): void;\n  off(eventName: string, handler: (this: TContext, t: T, ...args: any[]) => any): void;\n}\n\nexport interface EventListenerObject<E> {\n  handleEvent(evt: E): void;\n}\n\nexport interface HasEventTargetAddRemove<E> {\n  addEventListener(\n    type: string,\n    listener: ((evt: E) => void) | EventListenerObject<E> | null,\n    options?: boolean | AddEventListenerOptions\n  ): void;\n  removeEventListener(\n    type: string,\n    listener: ((evt: E) => void) | EventListenerObject<E> | null,\n    options?: EventListenerOptions | boolean\n  ): void;\n}\n\nexport interface EventListenerOptions {\n  capture?: boolean;\n  passive?: boolean;\n  once?: boolean;\n}\n\nexport interface AddEventListenerOptions extends EventListenerOptions {\n  once?: boolean;\n  passive?: boolean;\n}\n\nexport function fromEvent<T>(target: HasEventTargetAddRemove<T> | ArrayLike<HasEventTargetAddRemove<T>>, eventName: string): Observable<T>;\nexport function fromEvent<T, R>(\n  target: HasEventTargetAddRemove<T> | ArrayLike<HasEventTargetAddRemove<T>>,\n  eventName: string,\n  resultSelector: (event: T) => R\n): Observable<R>;\nexport function fromEvent<T>(\n  target: HasEventTargetAddRemove<T> | ArrayLike<HasEventTargetAddRemove<T>>,\n  eventName: string,\n  options: EventListenerOptions\n): Observable<T>;\nexport function fromEvent<T, R>(\n  target: HasEventTargetAddRemove<T> | ArrayLike<HasEventTargetAddRemove<T>>,\n  eventName: string,\n  options: EventListenerOptions,\n  resultSelector: (event: T) => R\n): Observable<R>;\n\nexport function fromEvent(target: NodeStyleEventEmitter | ArrayLike<NodeStyleEventEmitter>, eventName: string): Observable<unknown>;\n/** @deprecated Do not specify explicit type parameters. 
Signatures with type parameters that cannot be inferred will be removed in v8. */\nexport function fromEvent<T>(target: NodeStyleEventEmitter | ArrayLike<NodeStyleEventEmitter>, eventName: string): Observable<T>;\nexport function fromEvent<R>(\n  target: NodeStyleEventEmitter | ArrayLike<NodeStyleEventEmitter>,\n  eventName: string,\n  resultSelector: (...args: any[]) => R\n): Observable<R>;\n\nexport function fromEvent(\n  target: NodeCompatibleEventEmitter | ArrayLike<NodeCompatibleEventEmitter>,\n  eventName: string\n): Observable<unknown>;\n/** @deprecated Do not specify explicit type parameters. Signatures with type parameters that cannot be inferred will be removed in v8. */\nexport function fromEvent<T>(target: NodeCompatibleEventEmitter | ArrayLike<NodeCompatibleEventEmitter>, eventName: string): Observable<T>;\nexport function fromEvent<R>(\n  target: NodeCompatibleEventEmitter | ArrayLike<NodeCompatibleEventEmitter>,\n  eventName: string,\n  resultSelector: (...args: any[]) => R\n): Observable<R>;\n\nexport function fromEvent<T>(\n  target: JQueryStyleEventEmitter<any, T> | ArrayLike<JQueryStyleEventEmitter<any, T>>,\n  eventName: string\n): Observable<T>;\nexport function fromEvent<T, R>(\n  target: JQueryStyleEventEmitter<any, T> | ArrayLike<JQueryStyleEventEmitter<any, T>>,\n  eventName: string,\n  resultSelector: (value: T, ...args: any[]) => R\n): Observable<R>;\n\n/**\n * Creates an Observable that emits events of a specific type coming from the\n * given event target.\n *\n * <span class=\"informal\">Creates an Observable from DOM events, or Node.js\n * EventEmitter events or others.</span>\n *\n * ![](fromEvent.png)\n *\n * `fromEvent` accepts as a first argument event target, which is an object with methods\n * for registering event handler functions. As a second argument it takes string that indicates\n * type of event we want to listen for. `fromEvent` supports selected types of event targets,\n * which are described in detail below. 
If your event target does not match any of the ones listed,\n * you should use {@link fromEventPattern}, which can be used on arbitrary APIs.\n * When it comes to APIs supported by `fromEvent`, their methods for adding and removing event\n * handler functions have different names, but they all accept a string describing event type\n * and function itself, which will be called whenever said event happens.\n *\n * Every time resulting Observable is subscribed, event handler function will be registered\n * to event target on given event type. When that event fires, value\n * passed as a first argument to registered function will be emitted by output Observable.\n * When Observable is unsubscribed, function will be unregistered from event target.\n *\n * Note that if event target calls registered function with more than one argument, second\n * and following arguments will not appear in resulting stream. In order to get access to them,\n * you can pass to `fromEvent` optional project function, which will be called with all arguments\n * passed to event handler. Output Observable will then emit value returned by project function,\n * instead of the usual value.\n *\n * Remember that event targets listed below are checked via duck typing. It means that\n * no matter what kind of object you have and no matter what environment you work in,\n * you can safely use `fromEvent` on that object if it exposes described methods (provided\n * of course they behave as was described above). 
So for example if Node.js library exposes\n * event target which has the same method names as DOM EventTarget, `fromEvent` is still\n * a good choice.\n *\n * If the API you use is more callback then event handler oriented (subscribed\n * callback function fires only once and thus there is no need to manually\n * unregister it), you should use {@link bindCallback} or {@link bindNodeCallback}\n * instead.\n *\n * `fromEvent` supports following types of event targets:\n *\n * **DOM EventTarget**\n *\n * This is an object with `addEventListener` and `removeEventListener` methods.\n *\n * In the browser, `addEventListener` accepts - apart from event type string and event\n * handler function arguments - optional third parameter, which is either an object or boolean,\n * both used for additional configuration how and when passed function will be called. When\n * `fromEvent` is used with event target of that type, you can provide this values\n * as third parameter as well.\n *\n * **Node.js EventEmitter**\n *\n * An object with `addListener` and `removeListener` methods.\n *\n * **JQuery-style event target**\n *\n * An object with `on` and `off` methods\n *\n * **DOM NodeList**\n *\n * List of DOM Nodes, returned for example by `document.querySelectorAll` or `Node.childNodes`.\n *\n * Although this collection is not event target in itself, `fromEvent` will iterate over all Nodes\n * it contains and install event handler function in every of them. When returned Observable\n * is unsubscribed, function will be removed from all Nodes.\n *\n * **DOM HtmlCollection**\n *\n * Just as in case of NodeList it is a collection of DOM nodes. 
Here as well event handler function is\n * installed and removed in each of elements.\n *\n *\n * ## Examples\n *\n * Emit clicks happening on the DOM document\n *\n * ```ts\n * import { fromEvent } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * clicks.subscribe(x => console.log(x));\n *\n * // Results in:\n * // MouseEvent object logged to console every time a click\n * // occurs on the document.\n * ```\n *\n * Use `addEventListener` with capture option\n *\n * ```ts\n * import { fromEvent } from 'rxjs';\n *\n * const div = document.createElement('div');\n * div.style.cssText = 'width: 200px; height: 200px; background: #09c;';\n * document.body.appendChild(div);\n *\n * // note optional configuration parameter which will be passed to addEventListener\n * const clicksInDocument = fromEvent(document, 'click', { capture: true });\n * const clicksInDiv = fromEvent(div, 'click');\n *\n * clicksInDocument.subscribe(() => console.log('document'));\n * clicksInDiv.subscribe(() => console.log('div'));\n *\n * // By default events bubble UP in DOM tree, so normally\n * // when we would click on div in document\n * // \"div\" would be logged first and then \"document\".\n * // Since we specified optional `capture` option, document\n * // will catch event when it goes DOWN DOM tree, so console\n * // will log \"document\" and then \"div\".\n * ```\n *\n * @see {@link bindCallback}\n * @see {@link bindNodeCallback}\n * @see {@link fromEventPattern}\n *\n * @param {FromEventTarget<T>} target The DOM EventTarget, Node.js\n * EventEmitter, JQuery-like event target, NodeList or HTMLCollection to attach the event handler to.\n * @param {string} eventName The event name of interest, being emitted by the\n * `target`.\n * @param {EventListenerOptions} [options] Options to pass through to addEventListener\n * @return {Observable<T>}\n */\nexport function fromEvent<T>(\n  target: any,\n  eventName: string,\n  options?: EventListenerOptions | ((...args: any[]) => 
T),\n  resultSelector?: (...args: any[]) => T\n): Observable<T> {\n  if (isFunction(options)) {\n    resultSelector = options;\n    options = undefined;\n  }\n  if (resultSelector) {\n    return fromEvent<T>(target, eventName, options as EventListenerOptions).pipe(mapOneOrManyArgs(resultSelector));\n  }\n\n  // Figure out our add and remove methods. In order to do this,\n  // we are going to analyze the target in a preferred order, if\n  // the target matches a given signature, we take the two \"add\" and \"remove\"\n  // method names and apply them to a map to create opposite versions of the\n  // same function. This is because they all operate in duplicate pairs,\n  // `addListener(name, handler)`, `removeListener(name, handler)`, for example.\n  // The call only differs by method name, as to whether or not you're adding or removing.\n  const [add, remove] =\n    // If it is an EventTarget, we need to use a slightly different method than the other two patterns.\n    isEventTarget(target)\n      ? eventTargetMethods.map((methodName) => (handler: any) => target[methodName](eventName, handler, options as EventListenerOptions))\n      : // In all other cases, the call pattern is identical with the exception of the method names.\n      isNodeStyleEventEmitter(target)\n      ? nodeEventEmitterMethods.map(toCommonHandlerRegistry(target, eventName))\n      : isJQueryStyleEventEmitter(target)\n      ? jqueryMethods.map(toCommonHandlerRegistry(target, eventName))\n      : [];\n\n  // If add is falsy, it's because we didn't match a pattern above.\n  // Check to see if it is an ArrayLike, because if it is, we want to\n  // try to apply fromEvent to all of it's items. 
We do this check last,\n  // because there are may be some types that are both ArrayLike *and* implement\n  // event registry points, and we'd rather delegate to that when possible.\n  if (!add) {\n    if (isArrayLike(target)) {\n      return mergeMap((subTarget: any) => fromEvent(subTarget, eventName, options as EventListenerOptions))(\n        innerFrom(target)\n      ) as Observable<T>;\n    }\n  }\n\n  // If add is falsy and we made it here, it's because we didn't\n  // match any valid target objects above.\n  if (!add) {\n    throw new TypeError('Invalid event target');\n  }\n\n  return new Observable<T>((subscriber) => {\n    // The handler we are going to register. Forwards the event object, by itself, or\n    // an array of arguments to the event handler, if there is more than one argument,\n    // to the consumer.\n    const handler = (...args: any[]) => subscriber.next(1 < args.length ? args : args[0]);\n    // Do the work of adding the handler to the target.\n    add(handler);\n    // When we finalize, we want to remove the handler and free up memory.\n    return () => remove!(handler);\n  });\n}\n\n/**\n * Used to create `add` and `remove` functions to register and unregister event handlers\n * from a target in the most common handler pattern, where there are only two arguments.\n * (e.g.  
`on(name, fn)`, `off(name, fn)`, `addListener(name, fn)`, or `removeListener(name, fn)`)\n * @param target The target we're calling methods on\n * @param eventName The event name for the event we're creating register or unregister functions for\n */\nfunction toCommonHandlerRegistry(target: any, eventName: string) {\n  return (methodName: string) => (handler: any) => target[methodName](eventName, handler);\n}\n\n/**\n * Checks to see if the target implements the required node-style EventEmitter methods\n * for adding and removing event handlers.\n * @param target the object to check\n */\nfunction isNodeStyleEventEmitter(target: any): target is NodeStyleEventEmitter {\n  return isFunction(target.addListener) && isFunction(target.removeListener);\n}\n\n/**\n * Checks to see if the target implements the required jQuery-style EventEmitter methods\n * for adding and removing event handlers.\n * @param target the object to check\n */\nfunction isJQueryStyleEventEmitter(target: any): target is JQueryStyleEventEmitter<any, any> {\n  return isFunction(target.on) && isFunction(target.off);\n}\n\n/**\n * Checks to see if the target implements the required EventTarget methods\n * for adding and removing event handlers.\n * @param target the object to check\n */\nfunction isEventTarget(target: any): target is HasEventTargetAddRemove<any> {\n  return isFunction(target.addEventListener) && isFunction(target.removeEventListener);\n}\n", "import { Observable } from '../Observable';\nimport { isFunction } from '../util/isFunction';\nimport { NodeEventHandler } from './fromEvent';\nimport { mapOneOrManyArgs } from '../util/mapOneOrManyArgs';\n\n/* tslint:disable:max-line-length */\nexport function fromEventPattern<T>(\n  addHandler: (handler: NodeEventHandler) => any,\n  removeHandler?: (handler: NodeEventHandler, signal?: any) => void\n): Observable<T>;\nexport function fromEventPattern<T>(\n  addHandler: (handler: NodeEventHandler) => any,\n  removeHandler?: (handler: 
NodeEventHandler, signal?: any) => void,\n  resultSelector?: (...args: any[]) => T\n): Observable<T>;\n/* tslint:enable:max-line-length */\n\n/**\n * Creates an Observable from an arbitrary API for registering event handlers.\n *\n * <span class=\"informal\">When that method for adding event handler was something {@link fromEvent}\n * was not prepared for.</span>\n *\n * ![](fromEventPattern.png)\n *\n * `fromEventPattern` allows you to convert into an Observable any API that supports registering handler functions\n * for events. It is similar to {@link fromEvent}, but far\n * more flexible. In fact, all use cases of {@link fromEvent} could be easily handled by\n * `fromEventPattern` (although in slightly more verbose way).\n *\n * This operator accepts as a first argument an `addHandler` function, which will be injected with\n * handler parameter. That handler is actually an event handler function that you now can pass\n * to API expecting it. `addHandler` will be called whenever Observable\n * returned by the operator is subscribed, so registering handler in API will not\n * necessarily happen when `fromEventPattern` is called.\n *\n * After registration, every time an event that we listen to happens,\n * Observable returned by `fromEventPattern` will emit value that event handler\n * function was called with. Note that if event handler was called with more\n * than one argument, second and following arguments will not appear in the Observable.\n *\n * If API you are using allows to unregister event handlers as well, you can pass to `fromEventPattern`\n * another function - `removeHandler` - as a second parameter. It will be injected\n * with the same handler function as before, which now you can use to unregister\n * it from the API. `removeHandler` will be called when consumer of resulting Observable\n * unsubscribes from it.\n *\n * In some APIs unregistering is actually handled differently. 
Method registering an event handler\n * returns some kind of token, which is later used to identify which function should\n * be unregistered or it itself has method that unregisters event handler.\n * If that is the case with your API, make sure token returned\n * by registering method is returned by `addHandler`. Then it will be passed\n * as a second argument to `removeHandler`, where you will be able to use it.\n *\n * If you need access to all event handler parameters (not only the first one),\n * or you need to transform them in any way, you can call `fromEventPattern` with optional\n * third parameter - project function which will accept all arguments passed to\n * event handler when it is called. Whatever is returned from project function will appear on\n * resulting stream instead of usual event handlers first argument. This means\n * that default project can be thought of as function that takes its first parameter\n * and ignores the rest.\n *\n * ## Examples\n *\n * Emits clicks happening on the DOM document\n *\n * ```ts\n * import { fromEventPattern } from 'rxjs';\n *\n * function addClickHandler(handler) {\n *   document.addEventListener('click', handler);\n * }\n *\n * function removeClickHandler(handler) {\n *   document.removeEventListener('click', handler);\n * }\n *\n * const clicks = fromEventPattern(\n *   addClickHandler,\n *   removeClickHandler\n * );\n * clicks.subscribe(x => console.log(x));\n *\n * // Whenever you click anywhere in the browser, DOM MouseEvent\n * // object will be logged.\n * ```\n *\n * Use with API that returns cancellation token\n *\n * ```ts\n * import { fromEventPattern } from 'rxjs';\n *\n * const token = someAPI.registerEventHandler(function() {});\n * someAPI.unregisterEventHandler(token); // this APIs cancellation method accepts\n *                                        // not handler itself, but special token.\n *\n * const someAPIObservable = fromEventPattern(\n *   function(handler) { return 
someAPI.registerEventHandler(handler); }, // Note that we return the token here...\n *   function(handler, token) { someAPI.unregisterEventHandler(token); }  // ...to then use it here.\n * );\n * ```\n *\n * Use with project function\n *\n * ```ts\n * import { fromEventPattern } from 'rxjs';\n *\n * someAPI.registerEventHandler((eventType, eventMessage) => {\n *   console.log(eventType, eventMessage); // Logs 'EVENT_TYPE' 'EVENT_MESSAGE' to console.\n * });\n *\n * const someAPIObservable = fromEventPattern(\n *   handler => someAPI.registerEventHandler(handler),\n *   handler => someAPI.unregisterEventHandler(handler)\n *   (eventType, eventMessage) => eventType + ' --- ' + eventMessage // without that function only 'EVENT_TYPE'\n * );                                                                // would be emitted by the Observable\n *\n * someAPIObservable.subscribe(value => console.log(value));\n *\n * // Logs:\n * // 'EVENT_TYPE --- EVENT_MESSAGE'\n * ```\n *\n * @see {@link fromEvent}\n * @see {@link bindCallback}\n * @see {@link bindNodeCallback}\n *\n * @param {function(handler: Function): any} addHandler A function that takes\n * a `handler` function as argument and attaches it somehow to the actual\n * source of events.\n * @param {function(handler: Function, token?: any): void} [removeHandler] A function that\n * takes a `handler` function as an argument and removes it from the event source. If `addHandler`\n * returns some kind of token, `removeHandler` function will have it as a second parameter.\n * @param {function(...args: any): T} [project] A function to\n * transform results. It takes the arguments from the event handler and\n * should return a single value.\n * @return {Observable<T>} Observable which, when an event happens, emits first parameter\n * passed to registered event handler. 
Alternatively it emits whatever project function returns\n * at that moment.\n */\nexport function fromEventPattern<T>(\n  addHandler: (handler: NodeEventHandler) => any,\n  removeHandler?: (handler: NodeEventHandler, signal?: any) => void,\n  resultSelector?: (...args: any[]) => T\n): Observable<T | T[]> {\n  if (resultSelector) {\n    return fromEventPattern<T>(addHandler, removeHandler).pipe(mapOneOrManyArgs(resultSelector));\n  }\n\n  return new Observable<T | T[]>((subscriber) => {\n    const handler = (...e: T[]) => subscriber.next(e.length === 1 ? e[0] : e);\n    const retValue = addHandler(handler);\n    return isFunction(removeHandler) ? () => removeHandler(handler, retValue) : undefined;\n  });\n}\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\nimport { async as asyncScheduler } from '../scheduler/async';\nimport { isScheduler } from '../util/isScheduler';\nimport { isValidDate } from '../util/isDate';\n\n/**\n * Creates an observable that will wait for a specified time period, or exact date, before\n * emitting the number 0.\n *\n * <span class=\"informal\">Used to emit a notification after a delay.</span>\n *\n * This observable is useful for creating delays in code, or racing against other values\n * for ad-hoc timeouts.\n *\n * The `delay` is specified by default in milliseconds, however providing a custom scheduler could\n * create a different behavior.\n *\n * ## Examples\n *\n * Wait 3 seconds and start another observable\n *\n * You might want to use `timer` to delay subscription to an\n * observable by a set amount of time. 
Here we use a timer with\n * {@link concatMapTo} or {@link concatMap} in order to wait\n * a few seconds and start a subscription to a source.\n *\n * ```ts\n * import { of, timer, concatMap } from 'rxjs';\n *\n * // This could be any observable\n * const source = of(1, 2, 3);\n *\n * timer(3000)\n *   .pipe(concatMap(() => source))\n *   .subscribe(console.log);\n * ```\n *\n * Take all values until the start of the next minute\n *\n * Using a `Date` as the trigger for the first emission, you can\n * do things like wait until midnight to fire an event, or in this case,\n * wait until a new minute starts (chosen so the example wouldn't take\n * too long to run) in order to stop watching a stream. Leveraging\n * {@link takeUntil}.\n *\n * ```ts\n * import { interval, takeUntil, timer } from 'rxjs';\n *\n * // Build a Date object that marks the\n * // next minute.\n * const currentDate = new Date();\n * const startOfNextMinute = new Date(\n *   currentDate.getFullYear(),\n *   currentDate.getMonth(),\n *   currentDate.getDate(),\n *   currentDate.getHours(),\n *   currentDate.getMinutes() + 1\n * );\n *\n * // This could be any observable stream\n * const source = interval(1000);\n *\n * const result = source.pipe(\n *   takeUntil(timer(startOfNextMinute))\n * );\n *\n * result.subscribe(console.log);\n * ```\n *\n * ### Known Limitations\n *\n * - The {@link asyncScheduler} uses `setTimeout` which has limitations for how far in the future it can be scheduled.\n *\n * - If a `scheduler` is provided that returns a timestamp other than an epoch from `now()`, and\n * a `Date` object is passed to the `dueTime` argument, the calculation for when the first emission\n * should occur will be incorrect. 
In this case, it would be best to do your own calculations\n * ahead of time, and pass a `number` in as the `dueTime`.\n *\n * @param due If a `number`, the amount of time in milliseconds to wait before emitting.\n * If a `Date`, the exact time at which to emit.\n * @param scheduler The scheduler to use to schedule the delay. Defaults to {@link asyncScheduler}.\n */\nexport function timer(due: number | Date, scheduler?: SchedulerLike): Observable<0>;\n\n/**\n * Creates an observable that starts an interval after a specified delay, emitting incrementing numbers -- starting at `0` --\n * on each interval after words.\n *\n * The `delay` and `intervalDuration` are specified by default in milliseconds, however providing a custom scheduler could\n * create a different behavior.\n *\n * ## Example\n *\n * ### Start an interval that starts right away\n *\n * Since {@link interval} waits for the passed delay before starting,\n * sometimes that's not ideal. You may want to start an interval immediately.\n * `timer` works well for this. Here we have both side-by-side so you can\n * see them in comparison.\n *\n * Note that this observable will never complete.\n *\n * ```ts\n * import { timer, interval } from 'rxjs';\n *\n * timer(0, 1000).subscribe(n => console.log('timer', n));\n * interval(1000).subscribe(n => console.log('interval', n));\n * ```\n *\n * ### Known Limitations\n *\n * - The {@link asyncScheduler} uses `setTimeout` which has limitations for how far in the future it can be scheduled.\n *\n * - If a `scheduler` is provided that returns a timestamp other than an epoch from `now()`, and\n * a `Date` object is passed to the `dueTime` argument, the calculation for when the first emission\n * should occur will be incorrect. 
In this case, it would be best to do your own calculations\n * ahead of time, and pass a `number` in as the `startDue`.\n * @param startDue If a `number`, is the time to wait before starting the interval.\n * If a `Date`, is the exact time at which to start the interval.\n * @param intervalDuration The delay between each value emitted in the interval. Passing a\n * negative number here will result in immediate completion after the first value is emitted, as though\n * no `intervalDuration` was passed at all.\n * @param scheduler The scheduler to use to schedule the delay. Defaults to {@link asyncScheduler}.\n */\nexport function timer(startDue: number | Date, intervalDuration: number, scheduler?: SchedulerLike): Observable<number>;\n\n/**\n * @deprecated The signature allowing `undefined` to be passed for `intervalDuration` will be removed in v8. Use the `timer(dueTime, scheduler?)` signature instead.\n */\nexport function timer(dueTime: number | Date, unused: undefined, scheduler?: SchedulerLike): Observable<0>;\n\nexport function timer(\n  dueTime: number | Date = 0,\n  intervalOrScheduler?: number | SchedulerLike,\n  scheduler: SchedulerLike = asyncScheduler\n): Observable<number> {\n  // Since negative intervalDuration is treated as though no\n  // interval was specified at all, we start with a negative number.\n  let intervalDuration = -1;\n\n  if (intervalOrScheduler != null) {\n    // If we have a second argument, and it's a scheduler,\n    // override the scheduler we had defaulted. Otherwise,\n    // it must be an interval.\n    if (isScheduler(intervalOrScheduler)) {\n      scheduler = intervalOrScheduler;\n    } else {\n      // Note that this *could* be negative, in which case\n      // it's like not passing an intervalDuration at all.\n      intervalDuration = intervalOrScheduler;\n    }\n  }\n\n  return new Observable((subscriber) => {\n    // If a valid date is passed, calculate how long to wait before\n    // executing the first value... 
otherwise, if it's a number just schedule\n    // that many milliseconds (or scheduler-specified unit size) in the future.\n    let due = isValidDate(dueTime) ? +dueTime - scheduler!.now() : dueTime;\n\n    if (due < 0) {\n      // Ensure we don't schedule in the future.\n      due = 0;\n    }\n\n    // The incrementing value we emit.\n    let n = 0;\n\n    // Start the timer.\n    return scheduler.schedule(function () {\n      if (!subscriber.closed) {\n        // Emit the next value and increment.\n        subscriber.next(n++);\n\n        if (0 <= intervalDuration) {\n          // If we have a interval after the initial timer,\n          // reschedule with the period.\n          this.schedule(undefined, intervalDuration);\n        } else {\n          // We didn't have an interval. So just complete.\n          subscriber.complete();\n        }\n      }\n    }, due);\n  });\n}\n", "import { Observable } from '../Observable';\nimport { ObservableInput, ObservableInputTuple, SchedulerLike } from '../types';\nimport { mergeAll } from '../operators/mergeAll';\nimport { innerFrom } from './innerFrom';\nimport { EMPTY } from './empty';\nimport { popNumber, popScheduler } from '../util/args';\nimport { from } from './from';\n\nexport function merge<A extends readonly unknown[]>(...sources: [...ObservableInputTuple<A>]): Observable<A[number]>;\nexport function merge<A extends readonly unknown[]>(...sourcesAndConcurrency: [...ObservableInputTuple<A>, number?]): Observable<A[number]>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `mergeAll`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function merge<A extends readonly unknown[]>(\n  ...sourcesAndScheduler: [...ObservableInputTuple<A>, SchedulerLike?]\n): Observable<A[number]>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `mergeAll`. 
Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function merge<A extends readonly unknown[]>(\n  ...sourcesAndConcurrencyAndScheduler: [...ObservableInputTuple<A>, number?, SchedulerLike?]\n): Observable<A[number]>;\n\n/**\n * Creates an output Observable which concurrently emits all values from every\n * given input Observable.\n *\n * <span class=\"informal\">Flattens multiple Observables together by blending\n * their values into one Observable.</span>\n *\n * ![](merge.png)\n *\n * `merge` subscribes to each given input Observable (as arguments), and simply\n * forwards (without doing any transformation) all the values from all the input\n * Observables to the output Observable. The output Observable only completes\n * once all input Observables have completed. Any error delivered by an input\n * Observable will be immediately emitted on the output Observable.\n *\n * ## Examples\n *\n * Merge together two Observables: 1s interval and clicks\n *\n * ```ts\n * import { merge, fromEvent, interval } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const timer = interval(1000);\n * const clicksOrTimer = merge(clicks, timer);\n * clicksOrTimer.subscribe(x => console.log(x));\n *\n * // Results in the following:\n * // timer will emit ascending values, one every second(1000ms) to console\n * // clicks logs MouseEvents to console every time the \"document\" is clicked\n * // Since the two streams are merged you see these happening\n * // as they occur.\n * ```\n *\n * Merge together 3 Observables, but run only 2 concurrently\n *\n * ```ts\n * import { interval, take, merge } from 'rxjs';\n *\n * const timer1 = interval(1000).pipe(take(10));\n * const timer2 = interval(2000).pipe(take(6));\n * const timer3 = interval(500).pipe(take(10));\n *\n * const concurrent = 2; // the argument\n * const merged = merge(timer1, timer2, timer3, concurrent);\n * merged.subscribe(x => console.log(x));\n *\n * // Results in the following:\n * 
// - First timer1 and timer2 will run concurrently\n * // - timer1 will emit a value every 1000ms for 10 iterations\n * // - timer2 will emit a value every 2000ms for 6 iterations\n * // - after timer1 hits its max iteration, timer2 will\n * //   continue, and timer3 will start to run concurrently with timer2\n * // - when timer2 hits its max iteration it terminates, and\n * //   timer3 will continue to emit a value every 500ms until it is complete\n * ```\n *\n * @see {@link mergeAll}\n * @see {@link mergeMap}\n * @see {@link mergeMapTo}\n * @see {@link mergeScan}\n *\n * @param {...ObservableInput} observables Input Observables to merge together.\n * @param {number} [concurrent=Infinity] Maximum number of input\n * Observables being subscribed to concurrently.\n * @param {SchedulerLike} [scheduler=null] The {@link SchedulerLike} to use for managing\n * concurrency of input Observables.\n * @return {Observable} an Observable that emits items that are the result of\n * every input Observable.\n */\nexport function merge(...args: (ObservableInput<unknown> | number | SchedulerLike)[]): Observable<unknown> {\n  const scheduler = popScheduler(args);\n  const concurrent = popNumber(args, Infinity);\n  const sources = args as ObservableInput<unknown>[];\n  return !sources.length\n    ? // No source provided\n      EMPTY\n    : sources.length === 1\n    ? // One source? Just return it.\n      innerFrom(sources[0])\n    : // Merge all sources\n      mergeAll(concurrent)(from(sources, scheduler));\n}\n", "import { Observable } from '../Observable';\nimport { noop } from '../util/noop';\n\n/**\n * An Observable that emits no items to the Observer and never completes.\n *\n * ![](never.png)\n *\n * A simple Observable that emits neither values nor errors nor the completion\n * notification. It can be used for testing purposes or for composing with other\n * Observables. 
Please note that by never emitting a complete notification, this\n * Observable keeps the subscription from being disposed automatically.\n * Subscriptions need to be manually disposed.\n *\n * ##  Example\n *\n * Emit the number 7, then never emit anything else (not even complete)\n *\n * ```ts\n * import { NEVER, startWith } from 'rxjs';\n *\n * const info = () => console.log('Will not be called');\n *\n * const result = NEVER.pipe(startWith(7));\n * result.subscribe({\n *   next: x => console.log(x),\n *   error: info,\n *   complete: info\n * });\n * ```\n *\n * @see {@link Observable}\n * @see {@link EMPTY}\n * @see {@link of}\n * @see {@link throwError}\n */\nexport const NEVER = new Observable<never>(noop);\n\n/**\n * @deprecated Replaced with the {@link NEVER} constant. Will be removed in v8.\n */\nexport function never() {\n  return NEVER;\n}\n", "const { isArray } = Array;\n\n/**\n * Used in operators and functions that accept either a list of arguments, or an array of arguments\n * as a single argument.\n */\nexport function argsOrArgArray<T>(args: (T | T[])[]): T[] {\n  return args.length === 1 && isArray(args[0]) ? args[0] : (args as T[]);\n}\n", "import { OperatorFunction, MonoTypeOperatorFunction, TruthyTypesOf } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/** @deprecated Use a closure instead of a `thisArg`. Signatures accepting a `thisArg` will be removed in v8. */\nexport function filter<T, S extends T, A>(predicate: (this: A, value: T, index: number) => value is S, thisArg: A): OperatorFunction<T, S>;\nexport function filter<T, S extends T>(predicate: (value: T, index: number) => value is S): OperatorFunction<T, S>;\nexport function filter<T>(predicate: BooleanConstructor): OperatorFunction<T, TruthyTypesOf<T>>;\n/** @deprecated Use a closure instead of a `thisArg`. Signatures accepting a `thisArg` will be removed in v8. 
*/\nexport function filter<T, A>(predicate: (this: A, value: T, index: number) => boolean, thisArg: A): MonoTypeOperatorFunction<T>;\nexport function filter<T>(predicate: (value: T, index: number) => boolean): MonoTypeOperatorFunction<T>;\n\n/**\n * Filter items emitted by the source Observable by only emitting those that\n * satisfy a specified predicate.\n *\n * <span class=\"informal\">Like\n * [Array.prototype.filter()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter),\n * it only emits a value from the source if it passes a criterion function.</span>\n *\n * ![](filter.png)\n *\n * Similar to the well-known `Array.prototype.filter` method, this operator\n * takes values from the source Observable, passes them through a `predicate`\n * function and only emits those values that yielded `true`.\n *\n * ## Example\n *\n * Emit only click events whose target was a DIV element\n *\n * ```ts\n * import { fromEvent, filter } from 'rxjs';\n *\n * const div = document.createElement('div');\n * div.style.cssText = 'width: 200px; height: 200px; background: #09c;';\n * document.body.appendChild(div);\n *\n * const clicks = fromEvent(document, 'click');\n * const clicksOnDivs = clicks.pipe(filter(ev => (<HTMLElement>ev.target).tagName === 'DIV'));\n * clicksOnDivs.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link distinct}\n * @see {@link distinctUntilChanged}\n * @see {@link distinctUntilKeyChanged}\n * @see {@link ignoreElements}\n * @see {@link partition}\n * @see {@link skip}\n *\n * @param predicate A function that\n * evaluates each value emitted by the source Observable. If it returns `true`,\n * the value is emitted, if `false` the value is not passed to the output\n * Observable. 
The `index` parameter is the number `i` for the i-th source\n * emission that has happened since the subscription, starting from the number\n * `0`.\n * @param thisArg An optional argument to determine the value of `this`\n * in the `predicate` function.\n * @return A function that returns an Observable that emits items from the\n * source Observable that satisfy the specified `predicate`.\n */\nexport function filter<T>(predicate: (value: T, index: number) => boolean, thisArg?: any): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    // An index passed to our predicate function on each call.\n    let index = 0;\n\n    // Subscribe to the source, all errors and completions are\n    // forwarded to the consumer.\n    source.subscribe(\n      // Call the predicate with the appropriate `this` context,\n      // if the predicate returns `true`, then send the value\n      // to the consumer.\n      createOperatorSubscriber(subscriber, (value) => predicate.call(thisArg, value, index++) && subscriber.next(value))\n    );\n  });\n}\n", "import { Observable } from '../Observable';\nimport { ObservableInputTuple } from '../types';\nimport { innerFrom } from './innerFrom';\nimport { argsOrArgArray } from '../util/argsOrArgArray';\nimport { EMPTY } from './empty';\nimport { createOperatorSubscriber } from '../operators/OperatorSubscriber';\nimport { popResultSelector } from '../util/args';\n\nexport function zip<A extends readonly unknown[]>(sources: [...ObservableInputTuple<A>]): Observable<A>;\nexport function zip<A extends readonly unknown[], R>(\n  sources: [...ObservableInputTuple<A>],\n  resultSelector: (...values: A) => R\n): Observable<R>;\nexport function zip<A extends readonly unknown[]>(...sources: [...ObservableInputTuple<A>]): Observable<A>;\nexport function zip<A extends readonly unknown[], R>(\n  ...sourcesAndResultSelector: [...ObservableInputTuple<A>, (...values: A) => R]\n): Observable<R>;\n\n/**\n * Combines multiple Observables 
to create an Observable whose values are calculated from the values, in order, of each\n * of its input Observables.\n *\n * If the last parameter is a function, this function is used to compute the created value from the input values.\n * Otherwise, an array of the input values is returned.\n *\n * ## Example\n *\n * Combine age and name from different sources\n *\n * ```ts\n * import { of, zip, map } from 'rxjs';\n *\n * const age$ = of(27, 25, 29);\n * const name$ = of('Foo', 'Bar', 'Beer');\n * const isDev$ = of(true, true, false);\n *\n * zip(age$, name$, isDev$).pipe(\n *   map(([age, name, isDev]) => ({ age, name, isDev }))\n * )\n * .subscribe(x => console.log(x));\n *\n * // Outputs\n * // { age: 27, name: 'Foo', isDev: true }\n * // { age: 25, name: 'Bar', isDev: true }\n * // { age: 29, name: 'Beer', isDev: false }\n * ```\n *\n * @param sources\n * @return {Observable<R>}\n */\nexport function zip(...args: unknown[]): Observable<unknown> {\n  const resultSelector = popResultSelector(args);\n\n  const sources = argsOrArgArray(args) as Observable<unknown>[];\n\n  return sources.length\n    ? new Observable<unknown[]>((subscriber) => {\n        // A collection of buffers of values from each source.\n        // Keyed by the same index with which the sources were passed in.\n        let buffers: unknown[][] = sources.map(() => []);\n\n        // An array of flags of whether or not the sources have completed.\n        // This is used to check to see if we should complete the result.\n        // Keyed by the same index with which the sources were passed in.\n        let completed = sources.map(() => false);\n\n        // When everything is done, release the arrays above.\n        subscriber.add(() => {\n          buffers = completed = null!;\n        });\n\n        // Loop over our sources and subscribe to each one. 
The index `i` is\n        // especially important here, because we use it in closures below to\n        // access the related buffers and completion properties\n        for (let sourceIndex = 0; !subscriber.closed && sourceIndex < sources.length; sourceIndex++) {\n          innerFrom(sources[sourceIndex]).subscribe(\n            createOperatorSubscriber(\n              subscriber,\n              (value) => {\n                buffers[sourceIndex].push(value);\n                // if every buffer has at least one value in it, then we\n                // can shift out the oldest value from each buffer and emit\n                // them as an array.\n                if (buffers.every((buffer) => buffer.length)) {\n                  const result: any = buffers.map((buffer) => buffer.shift()!);\n                  // Emit the array. If theres' a result selector, use that.\n                  subscriber.next(resultSelector ? resultSelector(...result) : result);\n                  // If any one of the sources is both complete and has an empty buffer\n                  // then we complete the result. This is because we cannot possibly have\n                  // any more values to zip together.\n                  if (buffers.some((buffer, i) => !buffer.length && completed[i])) {\n                    subscriber.complete();\n                  }\n                }\n              },\n              () => {\n                // This source completed. 
Mark it as complete so we can check it later\n                // if we have to.\n                completed[sourceIndex] = true;\n                // But, if this complete source has nothing in its buffer, then we\n                // can complete the result, because we can't possibly have any more\n                // values from this to zip together with the other values.\n                !buffers[sourceIndex].length && subscriber.complete();\n              }\n            )\n          );\n        }\n\n        // When everything is done, release the arrays above.\n        return () => {\n          buffers = completed = null!;\n        };\n      })\n    : EMPTY;\n}\n", "import { Subscriber } from '../Subscriber';\nimport { MonoTypeOperatorFunction, ObservableInput } from '../types';\n\nimport { operate } from '../util/lift';\nimport { innerFrom } from '../observable/innerFrom';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * Ignores source values for a duration determined by another Observable, then\n * emits the most recent value from the source Observable, then repeats this\n * process.\n *\n * <span class=\"informal\">It's like {@link auditTime}, but the silencing\n * duration is determined by a second Observable.</span>\n *\n * ![](audit.svg)\n *\n * `audit` is similar to `throttle`, but emits the last value from the silenced\n * time window, instead of the first value. `audit` emits the most recent value\n * from the source Observable on the output Observable as soon as its internal\n * timer becomes disabled, and ignores source values while the timer is enabled.\n * Initially, the timer is disabled. As soon as the first source value arrives,\n * the timer is enabled by calling the `durationSelector` function with the\n * source value, which returns the \"duration\" Observable. 
When the duration\n * Observable emits a value, the timer is disabled, then the most\n * recent source value is emitted on the output Observable, and this process\n * repeats for the next source value.\n *\n * ## Example\n *\n * Emit clicks at a rate of at most one click per second\n *\n * ```ts\n * import { fromEvent, audit, interval } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(audit(ev => interval(1000)));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link auditTime}\n * @see {@link debounce}\n * @see {@link delayWhen}\n * @see {@link sample}\n * @see {@link throttle}\n *\n * @param durationSelector A function\n * that receives a value from the source Observable, for computing the silencing\n * duration, returned as an Observable or a Promise.\n * @return A function that returns an Observable that performs rate-limiting of\n * emissions from the source Observable.\n */\nexport function audit<T>(durationSelector: (value: T) => ObservableInput<any>): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    let hasValue = false;\n    let lastValue: T | null = null;\n    let durationSubscriber: Subscriber<any> | null = null;\n    let isComplete = false;\n\n    const endDuration = () => {\n      durationSubscriber?.unsubscribe();\n      durationSubscriber = null;\n      if (hasValue) {\n        hasValue = false;\n        const value = lastValue!;\n        lastValue = null;\n        subscriber.next(value);\n      }\n      isComplete && subscriber.complete();\n    };\n\n    const cleanupDuration = () => {\n      durationSubscriber = null;\n      isComplete && subscriber.complete();\n    };\n\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => {\n          hasValue = true;\n          lastValue = value;\n          if (!durationSubscriber) {\n            innerFrom(durationSelector(value)).subscribe(\n              (durationSubscriber 
= createOperatorSubscriber(subscriber, endDuration, cleanupDuration))\n            );\n          }\n        },\n        () => {\n          isComplete = true;\n          (!hasValue || !durationSubscriber || durationSubscriber.closed) && subscriber.complete();\n        }\n      )\n    );\n  });\n}\n", "import { asyncScheduler } from '../scheduler/async';\nimport { audit } from './audit';\nimport { timer } from '../observable/timer';\nimport { MonoTypeOperatorFunction, SchedulerLike } from '../types';\n\n/**\n * Ignores source values for `duration` milliseconds, then emits the most recent\n * value from the source Observable, then repeats this process.\n *\n * <span class=\"informal\">When it sees a source value, it ignores that plus\n * the next ones for `duration` milliseconds, and then it emits the most recent\n * value from the source.</span>\n *\n * ![](auditTime.png)\n *\n * `auditTime` is similar to `throttleTime`, but emits the last value from the\n * silenced time window, instead of the first value. `auditTime` emits the most\n * recent value from the source Observable on the output Observable as soon as\n * its internal timer becomes disabled, and ignores source values while the\n * timer is enabled. Initially, the timer is disabled. As soon as the first\n * source value arrives, the timer is enabled. 
After `duration` milliseconds (or\n * the time unit determined internally by the optional `scheduler`) has passed,\n * the timer is disabled, then the most recent source value is emitted on the\n * output Observable, and this process repeats for the next source value.\n * Optionally takes a {@link SchedulerLike} for managing timers.\n *\n * ## Example\n *\n * Emit clicks at a rate of at most one click per second\n *\n * ```ts\n * import { fromEvent, auditTime } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(auditTime(1000));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link audit}\n * @see {@link debounceTime}\n * @see {@link delay}\n * @see {@link sampleTime}\n * @see {@link throttleTime}\n *\n * @param {number} duration Time to wait before emitting the most recent source\n * value, measured in milliseconds or the time unit determined internally\n * by the optional `scheduler`.\n * @param {SchedulerLike} [scheduler=async] The {@link SchedulerLike} to use for\n * managing the timers that handle the rate-limiting behavior.\n * @return A function that returns an Observable that performs rate-limiting of\n * emissions from the source Observable.\n */\nexport function auditTime<T>(duration: number, scheduler: SchedulerLike = asyncScheduler): MonoTypeOperatorFunction<T> {\n  return audit(() => timer(duration, scheduler));\n}\n", "import { OperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { arrRemove } from '../util/arrRemove';\n\n/**\n * Buffers the source Observable values until the size hits the maximum\n * `bufferSize` given.\n *\n * <span class=\"informal\">Collects values from the past as an array, and emits\n * that array only when its size reaches `bufferSize`.</span>\n *\n * ![](bufferCount.png)\n *\n * Buffers a number of values from the source Observable by `bufferSize` then\n * emits the 
buffer and clears it, and starts a new buffer each\n * `startBufferEvery` values. If `startBufferEvery` is not provided or is\n * `null`, then new buffers are started immediately at the start of the source\n * and when each buffer closes and is emitted.\n *\n * ## Examples\n *\n * Emit the last two click events as an array\n *\n * ```ts\n * import { fromEvent, bufferCount } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const buffered = clicks.pipe(bufferCount(2));\n * buffered.subscribe(x => console.log(x));\n * ```\n *\n * On every click, emit the last two click events as an array\n *\n * ```ts\n * import { fromEvent, bufferCount } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const buffered = clicks.pipe(bufferCount(2, 1));\n * buffered.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link buffer}\n * @see {@link bufferTime}\n * @see {@link bufferToggle}\n * @see {@link bufferWhen}\n * @see {@link pairwise}\n * @see {@link windowCount}\n *\n * @param {number} bufferSize The maximum size of the buffer emitted.\n * @param {number} [startBufferEvery] Interval at which to start a new buffer.\n * For example if `startBufferEvery` is `2`, then a new buffer will be started\n * on every other value from the source. A new buffer is started at the\n * beginning of the source by default.\n * @return A function that returns an Observable of arrays of buffered values.\n */\nexport function bufferCount<T>(bufferSize: number, startBufferEvery: number | null = null): OperatorFunction<T, T[]> {\n  // If no `startBufferEvery` value was supplied, then we're\n  // opening and closing on the bufferSize itself.\n  startBufferEvery = startBufferEvery ?? 
bufferSize;\n\n  return operate((source, subscriber) => {\n    let buffers: T[][] = [];\n    let count = 0;\n\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => {\n          let toEmit: T[][] | null = null;\n\n          // Check to see if we need to start a buffer.\n          // This will start one at the first value, and then\n          // a new one every N after that.\n          if (count++ % startBufferEvery! === 0) {\n            buffers.push([]);\n          }\n\n          // Push our value into our active buffers.\n          for (const buffer of buffers) {\n            buffer.push(value);\n            // Check to see if we're over the bufferSize\n            // if we are, record it so we can emit it later.\n            // If we emitted it now and removed it, it would\n            // mutate the `buffers` array while we're looping\n            // over it.\n            if (bufferSize <= buffer.length) {\n              toEmit = toEmit ?? [];\n              toEmit.push(buffer);\n            }\n          }\n\n          if (toEmit) {\n            // We have found some buffers that are over the\n            // `bufferSize`. 
Emit them, and remove them from our\n            // buffers list.\n            for (const buffer of toEmit) {\n              arrRemove(buffers, buffer);\n              subscriber.next(buffer);\n            }\n          }\n        },\n        () => {\n          // When the source completes, emit all of our\n          // active buffers.\n          for (const buffer of buffers) {\n            subscriber.next(buffer);\n          }\n          subscriber.complete();\n        },\n        // Pass all errors through to consumer.\n        undefined,\n        () => {\n          // Clean up our memory when we finalize\n          buffers = null!;\n        }\n      )\n    );\n  });\n}\n", "import { Observable } from '../Observable';\n\nimport { ObservableInput, OperatorFunction, ObservedValueOf } from '../types';\nimport { Subscription } from '../Subscription';\nimport { innerFrom } from '../observable/innerFrom';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { operate } from '../util/lift';\n\n/* tslint:disable:max-line-length */\nexport function catchError<T, O extends ObservableInput<any>>(\n  selector: (err: any, caught: Observable<T>) => O\n): OperatorFunction<T, T | ObservedValueOf<O>>;\n/* tslint:enable:max-line-length */\n\n/**\n * Catches errors on the observable to be handled by returning a new observable or throwing an error.\n *\n * <span class=\"informal\">\n * It only listens to the error channel and ignores notifications.\n * Handles errors from the source observable, and maps them to a new observable.\n * The error may also be rethrown, or a new error can be thrown to emit an error from the result.\n * </span>\n *\n * ![](catch.png)\n *\n * This operator handles errors, but forwards along all other events to the resulting observable.\n * If the source observable terminates with an error, it will map that error to a new observable,\n * subscribe to it, and forward all of its events to the resulting observable.\n *\n * ## Examples\n *\n 
* Continue with a different Observable when there's an error\n *\n * ```ts\n * import { of, map, catchError } from 'rxjs';\n *\n * of(1, 2, 3, 4, 5)\n *   .pipe(\n *     map(n => {\n *       if (n === 4) {\n *         throw 'four!';\n *       }\n *       return n;\n *     }),\n *     catchError(err => of('I', 'II', 'III', 'IV', 'V'))\n *   )\n *   .subscribe(x => console.log(x));\n *   // 1, 2, 3, I, II, III, IV, V\n * ```\n *\n * Retry the caught source Observable again in case of error, similar to `retry()` operator\n *\n * ```ts\n * import { of, map, catchError, take } from 'rxjs';\n *\n * of(1, 2, 3, 4, 5)\n *   .pipe(\n *     map(n => {\n *       if (n === 4) {\n *         throw 'four!';\n *       }\n *       return n;\n *     }),\n *     catchError((err, caught) => caught),\n *     take(30)\n *   )\n *   .subscribe(x => console.log(x));\n *   // 1, 2, 3, 1, 2, 3, ...\n * ```\n *\n * Throw a new error when the source Observable throws an error\n *\n * ```ts\n * import { of, map, catchError } from 'rxjs';\n *\n * of(1, 2, 3, 4, 5)\n *   .pipe(\n *     map(n => {\n *       if (n === 4) {\n *         throw 'four!';\n *       }\n *       return n;\n *     }),\n *     catchError(err => {\n *       throw 'error in source. Details: ' + err;\n *     })\n *   )\n *   .subscribe({\n *     next: x => console.log(x),\n *     error: err => console.log(err)\n *   });\n *   // 1, 2, 3, error in source. Details: four!\n * ```\n *\n * @see {@link onErrorResumeNext}\n * @see {@link repeat}\n * @see {@link repeatWhen}\n * @see {@link retry }\n * @see {@link retryWhen}\n *\n * @param {function} selector a function that takes as arguments `err`, which is the error, and `caught`, which\n * is the source observable, in case you'd like to \"retry\" that observable by returning it again. 
Whatever observable\n * is returned by the `selector` will be used to continue the observable chain.\n * @return A function that returns an Observable that originates from either\n * the source or the Observable returned by the `selector` function.\n */\nexport function catchError<T, O extends ObservableInput<any>>(\n  selector: (err: any, caught: Observable<T>) => O\n): OperatorFunction<T, T | ObservedValueOf<O>> {\n  return operate((source, subscriber) => {\n    let innerSub: Subscription | null = null;\n    let syncUnsub = false;\n    let handledResult: Observable<ObservedValueOf<O>>;\n\n    innerSub = source.subscribe(\n      createOperatorSubscriber(subscriber, undefined, undefined, (err) => {\n        handledResult = innerFrom(selector(err, catchError(selector)(source)));\n        if (innerSub) {\n          innerSub.unsubscribe();\n          innerSub = null;\n          handledResult.subscribe(subscriber);\n        } else {\n          // We don't have an innerSub yet, that means the error was synchronous\n          // because the subscribe call hasn't returned yet.\n          syncUnsub = true;\n        }\n      })\n    );\n\n    if (syncUnsub) {\n      // We have a synchronous error, we need to make sure to\n      // finalize right away. This ensures that callbacks in the `finalize` operator are called\n      // at the right time, and that finalization occurs at the expected\n      // time between the source error and the subscription to the\n      // next observable.\n      innerSub.unsubscribe();\n      innerSub = null;\n      handledResult!.subscribe(subscriber);\n    }\n  });\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * A basic scan operation. 
This is used for `scan` and `reduce`.\n * @param accumulator The accumulator to use\n * @param seed The seed value for the state to accumulate\n * @param hasSeed Whether or not a seed was provided\n * @param emitOnNext Whether or not to emit the state on next\n * @param emitBeforeComplete Whether or not to emit the before completion\n */\n\nexport function scanInternals<V, A, S>(\n  accumulator: (acc: V | A | S, value: V, index: number) => A,\n  seed: S,\n  hasSeed: boolean,\n  emitOnNext: boolean,\n  emitBeforeComplete?: undefined | true\n) {\n  return (source: Observable<V>, subscriber: Subscriber<any>) => {\n    // Whether or not we have state yet. This will only be\n    // false before the first value arrives if we didn't get\n    // a seed value.\n    let hasState = hasSeed;\n    // The state that we're tracking, starting with the seed,\n    // if there is one, and then updated by the return value\n    // from the accumulator on each emission.\n    let state: any = seed;\n    // An index to pass to the accumulator function.\n    let index = 0;\n\n    // Subscribe to our source. All errors and completions are passed through.\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => {\n          // Always increment the index.\n          const i = index++;\n          // Set the state\n          state = hasState\n            ? 
// We already have state, so we can get the new state from the accumulator\n              accumulator(state, value, i)\n            : // We didn't have state yet, a seed value was not provided, so\n\n              // we set the state to the first value, and mark that we have state now\n              ((hasState = true), value);\n\n          // Maybe send it to the consumer.\n          emitOnNext && subscriber.next(state);\n        },\n        // If an onComplete was given, call it, otherwise\n        // just pass through the complete notification to the consumer.\n        emitBeforeComplete &&\n          (() => {\n            hasState && subscriber.next(state);\n            subscriber.complete();\n          })\n      )\n    );\n  };\n}\n", "import { combineLatestInit } from '../observable/combineLatest';\nimport { ObservableInput, ObservableInputTuple, OperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { argsOrArgArray } from '../util/argsOrArgArray';\nimport { mapOneOrManyArgs } from '../util/mapOneOrManyArgs';\nimport { pipe } from '../util/pipe';\nimport { popResultSelector } from '../util/args';\n\n/** @deprecated Replaced with {@link combineLatestWith}. Will be removed in v8. */\nexport function combineLatest<T, A extends readonly unknown[], R>(\n  sources: [...ObservableInputTuple<A>],\n  project: (...values: [T, ...A]) => R\n): OperatorFunction<T, R>;\n/** @deprecated Replaced with {@link combineLatestWith}. Will be removed in v8. */\nexport function combineLatest<T, A extends readonly unknown[], R>(sources: [...ObservableInputTuple<A>]): OperatorFunction<T, [T, ...A]>;\n\n/** @deprecated Replaced with {@link combineLatestWith}. Will be removed in v8. */\nexport function combineLatest<T, A extends readonly unknown[], R>(\n  ...sourcesAndProject: [...ObservableInputTuple<A>, (...values: [T, ...A]) => R]\n): OperatorFunction<T, R>;\n/** @deprecated Replaced with {@link combineLatestWith}. Will be removed in v8. 
*/\nexport function combineLatest<T, A extends readonly unknown[], R>(...sources: [...ObservableInputTuple<A>]): OperatorFunction<T, [T, ...A]>;\n\n/**\n * @deprecated Replaced with {@link combineLatestWith}. Will be removed in v8.\n */\nexport function combineLatest<T, R>(...args: (ObservableInput<any> | ((...values: any[]) => R))[]): OperatorFunction<T, unknown> {\n  const resultSelector = popResultSelector(args);\n  return resultSelector\n    ? pipe(combineLatest(...(args as Array<ObservableInput<any>>)), mapOneOrManyArgs(resultSelector))\n    : operate((source, subscriber) => {\n        combineLatestInit([source, ...argsOrArgArray(args)])(subscriber);\n      });\n}\n", "import { ObservableInputTuple, OperatorFunction, Cons } from '../types';\nimport { combineLatest } from './combineLatest';\n\n/**\n * Create an observable that combines the latest values from all passed observables and the source\n * into arrays and emits them.\n *\n * Returns an observable, that when subscribed to, will subscribe to the source observable and all\n * sources provided as arguments. Once all sources emit at least one value, all of the latest values\n * will be emitted as an array. 
After that, every time any source emits a value, all of the latest values\n * will be emitted as an array.\n *\n * This is a useful operator for eagerly calculating values based off of changed inputs.\n *\n * ## Example\n *\n * Simple concatenation of values from two inputs\n *\n * ```ts\n * import { fromEvent, combineLatestWith, map } from 'rxjs';\n *\n * // Setup: Add two inputs to the page\n * const input1 = document.createElement('input');\n * document.body.appendChild(input1);\n * const input2 = document.createElement('input');\n * document.body.appendChild(input2);\n *\n * // Get streams of changes\n * const input1Changes$ = fromEvent(input1, 'change');\n * const input2Changes$ = fromEvent(input2, 'change');\n *\n * // Combine the changes by adding them together\n * input1Changes$.pipe(\n *   combineLatestWith(input2Changes$),\n *   map(([e1, e2]) => (<HTMLInputElement>e1.target).value + ' - ' + (<HTMLInputElement>e2.target).value)\n * )\n * .subscribe(x => console.log(x));\n * ```\n *\n * @param otherSources the other sources to subscribe to.\n * @return A function that returns an Observable that emits the latest\n * emissions from both source and provided Observables.\n */\nexport function combineLatestWith<T, A extends readonly unknown[]>(\n  ...otherSources: [...ObservableInputTuple<A>]\n): OperatorFunction<T, Cons<T, A>> {\n  return combineLatest(...otherSources);\n}\n", "import { Subscriber } from '../Subscriber';\nimport { MonoTypeOperatorFunction, ObservableInput } from '../types';\nimport { operate } from '../util/lift';\nimport { noop } from '../util/noop';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { innerFrom } from '../observable/innerFrom';\n\n/**\n * Emits a notification from the source Observable only after a particular time span\n * determined by another Observable has passed without another source emission.\n *\n * <span class=\"informal\">It's like {@link debounceTime}, but the time span of\n * emission 
silence is determined by a second Observable.</span>\n *\n * ![](debounce.svg)\n *\n * `debounce` delays notifications emitted by the source Observable, but drops previous\n * pending delayed emissions if a new notification arrives on the source Observable.\n * This operator keeps track of the most recent notification from the source\n * Observable, and spawns a duration Observable by calling the\n * `durationSelector` function. The notification is emitted only when the duration\n * Observable emits a next notification, and if no other notification was emitted on\n * the source Observable since the duration Observable was spawned. If a new\n * notification appears before the duration Observable emits, the previous notification will\n * not be emitted and a new duration is scheduled from `durationSelector` is scheduled.\n * If the completing event happens during the scheduled duration the last cached notification\n * is emitted before the completion event is forwarded to the output observable.\n * If the error event happens during the scheduled duration or after it only the error event is\n * forwarded to the output observable. 
The cache notification is not emitted in this case.\n *\n * Like {@link debounceTime}, this is a rate-limiting operator, and also a\n * delay-like operator since output emissions do not necessarily occur at the\n * same time as they did on the source Observable.\n *\n * ## Example\n *\n * Emit the most recent click after a burst of clicks\n *\n * ```ts\n * import { fromEvent, scan, debounce, interval } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(\n *   scan(i => ++i, 1),\n *   debounce(i => interval(200 * i))\n * );\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link audit}\n * @see {@link auditTime}\n * @see {@link debounceTime}\n * @see {@link delay}\n * @see {@link sample}\n * @see {@link sampleTime}\n * @see {@link throttle}\n * @see {@link throttleTime}\n *\n * @param durationSelector A function\n * that receives a value from the source Observable, for computing the timeout\n * duration for each source value, returned as an Observable or a Promise.\n * @return A function that returns an Observable that delays the emissions of\n * the source Observable by the specified duration Observable returned by\n * `durationSelector`, and may drop some values if they occur too frequently.\n */\nexport function debounce<T>(durationSelector: (value: T) => ObservableInput<any>): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    let hasValue = false;\n    let lastValue: T | null = null;\n    // The subscriber/subscription for the current debounce, if there is one.\n    let durationSubscriber: Subscriber<any> | null = null;\n\n    const emit = () => {\n      // Unsubscribe any current debounce subscription we have,\n      // we only cared about the first notification from it, and we\n      // want to clean that subscription up as soon as possible.\n      durationSubscriber?.unsubscribe();\n      durationSubscriber = null;\n      if (hasValue) {\n        // We have a value! 
Free up memory first, then emit the value.\n        hasValue = false;\n        const value = lastValue!;\n        lastValue = null;\n        subscriber.next(value);\n      }\n    };\n\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value: T) => {\n          // Cancel any pending debounce duration. We don't\n          // need to null it out here yet tho, because we're just going\n          // to create another one in a few lines.\n          durationSubscriber?.unsubscribe();\n          hasValue = true;\n          lastValue = value;\n          // Capture our duration subscriber, so we can unsubscribe it when we're notified\n          // and we're going to emit the value.\n          durationSubscriber = createOperatorSubscriber(subscriber, emit, noop);\n          // Subscribe to the duration.\n          innerFrom(durationSelector(value)).subscribe(durationSubscriber);\n        },\n        () => {\n          // Source completed.\n          // Emit any pending debounced values then complete\n          emit();\n          subscriber.complete();\n        },\n        // Pass all errors through to consumer\n        undefined,\n        () => {\n          // Finalization.\n          lastValue = durationSubscriber = null;\n        }\n      )\n    );\n  });\n}\n", "import { asyncScheduler } from '../scheduler/async';\nimport { Subscription } from '../Subscription';\nimport { MonoTypeOperatorFunction, SchedulerAction, SchedulerLike } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * Emits a notification from the source Observable only after a particular time span\n * has passed without another source emission.\n *\n * <span class=\"informal\">It's like {@link delay}, but passes only the most\n * recent notification from each burst of emissions.</span>\n *\n * ![](debounceTime.png)\n *\n * `debounceTime` delays notifications emitted by the source Observable, 
but drops\n * previous pending delayed emissions if a new notification arrives on the source\n * Observable. This operator keeps track of the most recent notification from the\n * source Observable, and emits that only when `dueTime` has passed\n * without any other notification appearing on the source Observable. If a new value\n * appears before `dueTime` silence occurs, the previous notification will be dropped\n * and will not be emitted and a new `dueTime` is scheduled.\n * If the completing event happens during `dueTime` the last cached notification\n * is emitted before the completion event is forwarded to the output observable.\n * If the error event happens during `dueTime` or after it only the error event is\n * forwarded to the output observable. The cache notification is not emitted in this case.\n *\n * This is a rate-limiting operator, because it is impossible for more than one\n * notification to be emitted in any time window of duration `dueTime`, but it is also\n * a delay-like operator since output emissions do not occur at the same time as\n * they did on the source Observable. 
Optionally takes a {@link SchedulerLike} for\n * managing timers.\n *\n * ## Example\n *\n * Emit the most recent click after a burst of clicks\n *\n * ```ts\n * import { fromEvent, debounceTime } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(debounceTime(1000));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link audit}\n * @see {@link auditTime}\n * @see {@link debounce}\n * @see {@link sample}\n * @see {@link sampleTime}\n * @see {@link throttle}\n * @see {@link throttleTime}\n *\n * @param {number} dueTime The timeout duration in milliseconds (or the time\n * unit determined internally by the optional `scheduler`) for the window of\n * time required to wait for emission silence before emitting the most recent\n * source value.\n * @param {SchedulerLike} [scheduler=async] The {@link SchedulerLike} to use for\n * managing the timers that handle the timeout for each value.\n * @return A function that returns an Observable that delays the emissions of\n * the source Observable by the specified `dueTime`, and may drop some values\n * if they occur too frequently.\n */\nexport function debounceTime<T>(dueTime: number, scheduler: SchedulerLike = asyncScheduler): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    let activeTask: Subscription | null = null;\n    let lastValue: T | null = null;\n    let lastTime: number | null = null;\n\n    const emit = () => {\n      if (activeTask) {\n        // We have a value! Free up memory first, then emit the value.\n        activeTask.unsubscribe();\n        activeTask = null;\n        const value = lastValue!;\n        lastValue = null;\n        subscriber.next(value);\n      }\n    };\n    function emitWhenIdle(this: SchedulerAction<unknown>) {\n      // This is called `dueTime` after the first value\n      // but we might have received new values during this window!\n\n      const targetTime = lastTime! 
+ dueTime;\n      const now = scheduler.now();\n      if (now < targetTime) {\n        // On that case, re-schedule to the new target\n        activeTask = this.schedule(undefined, targetTime - now);\n        subscriber.add(activeTask);\n        return;\n      }\n\n      emit();\n    }\n\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value: T) => {\n          lastValue = value;\n          lastTime = scheduler.now();\n\n          // Only set up a task if it's not already up\n          if (!activeTask) {\n            activeTask = scheduler.schedule(emitWhenIdle, dueTime);\n            subscriber.add(activeTask);\n          }\n        },\n        () => {\n          // Source completed.\n          // Emit any pending debounced values then complete\n          emit();\n          subscriber.complete();\n        },\n        // Pass all errors through to consumer.\n        undefined,\n        () => {\n          // Finalization.\n          lastValue = activeTask = null;\n        }\n      )\n    );\n  });\n}\n", "import { OperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * Emits a given value if the source Observable completes without emitting any\n * `next` value, otherwise mirrors the source Observable.\n *\n * <span class=\"informal\">If the source Observable turns out to be empty, then\n * this operator will emit a default value.</span>\n *\n * ![](defaultIfEmpty.png)\n *\n * `defaultIfEmpty` emits the values emitted by the source Observable or a\n * specified default value if the source Observable is empty (completes without\n * having emitted any `next` value).\n *\n * ## Example\n *\n * If no clicks happen in 5 seconds, then emit 'no clicks'\n *\n * ```ts\n * import { fromEvent, takeUntil, interval, defaultIfEmpty } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const clicksBeforeFive = 
clicks.pipe(takeUntil(interval(5000)));\n * const result = clicksBeforeFive.pipe(defaultIfEmpty('no clicks'));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link empty}\n * @see {@link last}\n *\n * @param defaultValue The default value used if the source\n * Observable is empty.\n * @return A function that returns an Observable that emits either the\n * specified `defaultValue` if the source Observable emits no items, or the\n * values emitted by the source Observable.\n */\nexport function defaultIfEmpty<T, R>(defaultValue: R): OperatorFunction<T, T | R> {\n  return operate((source, subscriber) => {\n    let hasValue = false;\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => {\n          hasValue = true;\n          subscriber.next(value);\n        },\n        () => {\n          if (!hasValue) {\n            subscriber.next(defaultValue!);\n          }\n          subscriber.complete();\n        }\n      )\n    );\n  });\n}\n", "import { MonoTypeOperatorFunction } from '../types';\nimport { EMPTY } from '../observable/empty';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * Emits only the first `count` values emitted by the source Observable.\n *\n * <span class=\"informal\">Takes the first `count` values from the source, then\n * completes.</span>\n *\n * ![](take.png)\n *\n * `take` returns an Observable that emits only the first `count` values emitted\n * by the source Observable. If the source emits fewer than `count` values then\n * all of its values are emitted. 
After that, it completes, regardless if the\n * source completes.\n *\n * ## Example\n *\n * Take the first 5 seconds of an infinite 1-second interval Observable\n *\n * ```ts\n * import { interval, take } from 'rxjs';\n *\n * const intervalCount = interval(1000);\n * const takeFive = intervalCount.pipe(take(5));\n * takeFive.subscribe(x => console.log(x));\n *\n * // Logs:\n * // 0\n * // 1\n * // 2\n * // 3\n * // 4\n * ```\n *\n * @see {@link takeLast}\n * @see {@link takeUntil}\n * @see {@link takeWhile}\n * @see {@link skip}\n *\n * @param count The maximum number of `next` values to emit.\n * @return A function that returns an Observable that emits only the first\n * `count` values emitted by the source Observable, or all of the values from\n * the source if the source emits fewer than `count` values.\n */\nexport function take<T>(count: number): MonoTypeOperatorFunction<T> {\n  return count <= 0\n    ? // If we are taking no values, that's empty.\n      () => EMPTY\n    : operate((source, subscriber) => {\n        let seen = 0;\n        source.subscribe(\n          createOperatorSubscriber(subscriber, (value) => {\n            // Increment the number of values we have seen,\n            // then check it against the allowed count to see\n            // if we are still letting values through.\n            if (++seen <= count) {\n              subscriber.next(value);\n              // If we have met or passed our allowed count,\n              // we need to complete. 
We have to do <= here,\n              // because re-entrant code will increment `seen` twice.\n              if (count <= seen) {\n                subscriber.complete();\n              }\n            }\n          })\n        );\n      });\n}\n", "import { OperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { noop } from '../util/noop';\n\n/**\n * Ignores all items emitted by the source Observable and only passes calls of `complete` or `error`.\n *\n * ![](ignoreElements.png)\n *\n * The `ignoreElements` operator suppresses all items emitted by the source Observable,\n * but allows its termination notification (either `error` or `complete`) to pass through unchanged.\n *\n * If you do not care about the items being emitted by an Observable, but you do want to be notified\n * when it completes or when it terminates with an error, you can apply the `ignoreElements` operator\n * to the Observable, which will ensure that it will never call its observers\u2019 `next` handlers.\n *\n * ## Example\n *\n * Ignore all `next` emissions from the source\n *\n * ```ts\n * import { of, ignoreElements } from 'rxjs';\n *\n * of('you', 'talking', 'to', 'me')\n *   .pipe(ignoreElements())\n *   .subscribe({\n *     next: word => console.log(word),\n *     error: err => console.log('error:', err),\n *     complete: () => console.log('the end'),\n *   });\n *\n * // result:\n * // 'the end'\n * ```\n *\n * @return A function that returns an empty Observable that only calls\n * `complete` or `error`, based on which one is called by the source\n * Observable.\n */\nexport function ignoreElements(): OperatorFunction<unknown, never> {\n  return operate((source, subscriber) => {\n    source.subscribe(createOperatorSubscriber(subscriber, noop));\n  });\n}\n", "import { OperatorFunction } from '../types';\nimport { map } from './map';\n\n/** @deprecated To be removed in v9. 
Use {@link map} instead: `map(() => value)`. */\nexport function mapTo<R>(value: R): OperatorFunction<unknown, R>;\n/**\n * @deprecated Do not specify explicit type parameters. Signatures with type parameters\n * that cannot be inferred will be removed in v8. `mapTo` itself will be removed in v9,\n * use {@link map} instead: `map(() => value)`.\n * */\nexport function mapTo<T, R>(value: R): OperatorFunction<T, R>;\n\n/**\n * Emits the given constant value on the output Observable every time the source\n * Observable emits a value.\n *\n * <span class=\"informal\">Like {@link map}, but it maps every source value to\n * the same output value every time.</span>\n *\n * ![](mapTo.png)\n *\n * Takes a constant `value` as argument, and emits that whenever the source\n * Observable emits a value. In other words, ignores the actual source value,\n * and simply uses the emission moment to know when to emit the given `value`.\n *\n * ## Example\n *\n * Map every click to the string `'Hi'`\n *\n * ```ts\n * import { fromEvent, mapTo } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const greetings = clicks.pipe(mapTo('Hi'));\n *\n * greetings.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link map}\n *\n * @param value The value to map each source value to.\n * @return A function that returns an Observable that emits the given `value`\n * every time the source Observable emits.\n * @deprecated To be removed in v9. 
Use {@link map} instead: `map(() => value)`.\n */\nexport function mapTo<R>(value: R): OperatorFunction<unknown, R> {\n  return map(() => value);\n}\n", "import { Observable } from '../Observable';\nimport { MonoTypeOperatorFunction, ObservableInput } from '../types';\nimport { concat } from '../observable/concat';\nimport { take } from './take';\nimport { ignoreElements } from './ignoreElements';\nimport { mapTo } from './mapTo';\nimport { mergeMap } from './mergeMap';\nimport { innerFrom } from '../observable/innerFrom';\n\n/** @deprecated The `subscriptionDelay` parameter will be removed in v8. */\nexport function delayWhen<T>(\n  delayDurationSelector: (value: T, index: number) => ObservableInput<any>,\n  subscriptionDelay: Observable<any>\n): MonoTypeOperatorFunction<T>;\nexport function delayWhen<T>(delayDurationSelector: (value: T, index: number) => ObservableInput<any>): MonoTypeOperatorFunction<T>;\n\n/**\n * Delays the emission of items from the source Observable by a given time span\n * determined by the emissions of another Observable.\n *\n * <span class=\"informal\">It's like {@link delay}, but the time span of the\n * delay duration is determined by a second Observable.</span>\n *\n * ![](delayWhen.png)\n *\n * `delayWhen` operator shifts each emitted value from the source Observable by\n * a time span determined by another Observable. 
When the source emits a value,\n * the `delayDurationSelector` function is called with the value emitted from\n * the source Observable as the first argument to the `delayDurationSelector`.\n * The `delayDurationSelector` function should return an {@link ObservableInput},\n * that is internally converted to an Observable that is called the \"duration\"\n * Observable.\n *\n * The source value is emitted on the output Observable only when the \"duration\"\n * Observable emits ({@link guide/glossary-and-semantics#next next}s) any value.\n * Upon that, the \"duration\" Observable gets unsubscribed.\n *\n * Before RxJS V7, the {@link guide/glossary-and-semantics#complete completion}\n * of the \"duration\" Observable would have been triggering the emission of the\n * source value to the output Observable, but with RxJS V7, this is not the case\n * anymore.\n *\n * Only next notifications (from the \"duration\" Observable) trigger values from\n * the source Observable to be passed to the output Observable. If the \"duration\"\n * Observable only emits the complete notification (without next), the value\n * emitted by the source Observable will never get to the output Observable - it\n * will be swallowed. If the \"duration\" Observable errors, the error will be\n * propagated to the output Observable.\n *\n * Optionally, `delayWhen` takes a second argument, `subscriptionDelay`, which\n * is an Observable. When `subscriptionDelay` emits its first value or\n * completes, the source Observable is subscribed to and starts behaving like\n * described in the previous paragraph. 
If `subscriptionDelay` is not provided,\n * `delayWhen` will subscribe to the source Observable as soon as the output\n * Observable is subscribed.\n *\n * ## Example\n *\n * Delay each click by a random amount of time, between 0 and 5 seconds\n *\n * ```ts\n * import { fromEvent, delayWhen, interval } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const delayedClicks = clicks.pipe(\n *   delayWhen(() => interval(Math.random() * 5000))\n * );\n * delayedClicks.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link delay}\n * @see {@link throttle}\n * @see {@link throttleTime}\n * @see {@link debounce}\n * @see {@link debounceTime}\n * @see {@link sample}\n * @see {@link sampleTime}\n * @see {@link audit}\n * @see {@link auditTime}\n *\n * @param delayDurationSelector A function that returns an `ObservableInput` for\n * each `value` emitted by the source Observable, which is then used to delay the\n * emission of that `value` on the output Observable until the `ObservableInput`\n * returned from this function emits a next value. 
When called, beside `value`,\n * this function receives a zero-based `index` of the emission order.\n * @param subscriptionDelay An Observable that triggers the subscription to the\n * source Observable once it emits any value.\n * @return A function that returns an Observable that delays the emissions of\n * the source Observable by an amount of time specified by the Observable\n * returned by `delayDurationSelector`.\n */\nexport function delayWhen<T>(\n  delayDurationSelector: (value: T, index: number) => ObservableInput<any>,\n  subscriptionDelay?: Observable<any>\n): MonoTypeOperatorFunction<T> {\n  if (subscriptionDelay) {\n    // DEPRECATED PATH\n    return (source: Observable<T>) =>\n      concat(subscriptionDelay.pipe(take(1), ignoreElements()), source.pipe(delayWhen(delayDurationSelector)));\n  }\n\n  return mergeMap((value, index) => innerFrom(delayDurationSelector(value, index)).pipe(take(1), mapTo(value)));\n}\n", "import { asyncScheduler } from '../scheduler/async';\nimport { MonoTypeOperatorFunction, SchedulerLike } from '../types';\nimport { delayWhen } from './delayWhen';\nimport { timer } from '../observable/timer';\n\n/**\n * Delays the emission of items from the source Observable by a given timeout or\n * until a given Date.\n *\n * <span class=\"informal\">Time shifts each item by some specified amount of\n * milliseconds.</span>\n *\n * ![](delay.svg)\n *\n * If the delay argument is a Number, this operator time shifts the source\n * Observable by that amount of time expressed in milliseconds. 
The relative\n * time intervals between the values are preserved.\n *\n * If the delay argument is a Date, this operator time shifts the start of the\n * Observable execution until the given date occurs.\n *\n * ## Examples\n *\n * Delay each click by one second\n *\n * ```ts\n * import { fromEvent, delay } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const delayedClicks = clicks.pipe(delay(1000)); // each click emitted after 1 second\n * delayedClicks.subscribe(x => console.log(x));\n * ```\n *\n * Delay all clicks until a future date happens\n *\n * ```ts\n * import { fromEvent, delay } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const date = new Date('March 15, 2050 12:00:00'); // in the future\n * const delayedClicks = clicks.pipe(delay(date)); // click emitted only after that date\n * delayedClicks.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link delayWhen}\n * @see {@link throttle}\n * @see {@link throttleTime}\n * @see {@link debounce}\n * @see {@link debounceTime}\n * @see {@link sample}\n * @see {@link sampleTime}\n * @see {@link audit}\n * @see {@link auditTime}\n *\n * @param {number|Date} due The delay duration in milliseconds (a `number`) or\n * a `Date` until which the emission of the source items is delayed.\n * @param {SchedulerLike} [scheduler=async] The {@link SchedulerLike} to use for\n * managing the timers that handle the time-shift for each item.\n * @return A function that returns an Observable that delays the emissions of\n * the source Observable by the specified timeout or Date.\n */\nexport function delay<T>(due: number | Date, scheduler: SchedulerLike = asyncScheduler): MonoTypeOperatorFunction<T> {\n  const duration = timer(due, scheduler);\n  return delayWhen(() => duration);\n}\n", "import { MonoTypeOperatorFunction } from '../types';\nimport { identity } from '../util/identity';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from 
'./OperatorSubscriber';\n\nexport function distinctUntilChanged<T>(comparator?: (previous: T, current: T) => boolean): MonoTypeOperatorFunction<T>;\nexport function distinctUntilChanged<T, K>(\n  comparator: (previous: K, current: K) => boolean,\n  keySelector: (value: T) => K\n): MonoTypeOperatorFunction<T>;\n\n/**\n * Returns a result {@link Observable} that emits all values pushed by the source observable if they\n * are distinct in comparison to the last value the result observable emitted.\n *\n * When provided without parameters or with the first parameter (`{@link distinctUntilChanged#comparator comparator}`),\n * it behaves like this:\n *\n * 1. It will always emit the first value from the source.\n * 2. For all subsequent values pushed by the source, they will be compared to the previously emitted values\n *    using the provided `comparator` or an `===` equality check.\n * 3. If the value pushed by the source is determined to be unequal by this check, that value is emitted and\n *    becomes the new \"previously emitted value\" internally.\n *\n * When the second parameter (`{@link distinctUntilChanged#keySelector keySelector}`) is provided, the behavior\n * changes:\n *\n * 1. It will always emit the first value from the source.\n * 2. The `keySelector` will be run against all values, including the first value.\n * 3. For all values after the first, the selected key will be compared against the key selected from\n *    the previously emitted value using the `comparator`.\n * 4. If the keys are determined to be unequal by this check, the value (not the key), is emitted\n *    and the selected key from that value is saved for future comparisons against other keys.\n *\n * ## Examples\n *\n * A very basic example with no `{@link distinctUntilChanged#comparator comparator}`. 
Note that `1` is emitted more than once,\n * because it's distinct in comparison to the _previously emitted_ value,\n * not in comparison to _all other emitted values_.\n *\n * ```ts\n * import { of, distinctUntilChanged } from 'rxjs';\n *\n * of(1, 1, 1, 2, 2, 2, 1, 1, 3, 3)\n *   .pipe(distinctUntilChanged())\n *   .subscribe(console.log);\n * // Logs: 1, 2, 1, 3\n * ```\n *\n * With a `{@link distinctUntilChanged#comparator comparator}`, you can do custom comparisons. Let's say\n * you only want to emit a value when all of its components have\n * changed:\n *\n * ```ts\n * import { of, distinctUntilChanged } from 'rxjs';\n *\n * const totallyDifferentBuilds$ = of(\n *   { engineVersion: '1.1.0', transmissionVersion: '1.2.0' },\n *   { engineVersion: '1.1.0', transmissionVersion: '1.4.0' },\n *   { engineVersion: '1.3.0', transmissionVersion: '1.4.0' },\n *   { engineVersion: '1.3.0', transmissionVersion: '1.5.0' },\n *   { engineVersion: '2.0.0', transmissionVersion: '1.5.0' }\n * ).pipe(\n *   distinctUntilChanged((prev, curr) => {\n *     return (\n *       prev.engineVersion === curr.engineVersion ||\n *       prev.transmissionVersion === curr.transmissionVersion\n *     );\n *   })\n * );\n *\n * totallyDifferentBuilds$.subscribe(console.log);\n *\n * // Logs:\n * // { engineVersion: '1.1.0', transmissionVersion: '1.2.0' }\n * // { engineVersion: '1.3.0', transmissionVersion: '1.4.0' }\n * // { engineVersion: '2.0.0', transmissionVersion: '1.5.0' }\n * ```\n *\n * You can also provide a custom `{@link distinctUntilChanged#comparator comparator}` to check that emitted\n * changes are only in one direction. 
Let's say you only want to get\n * the next record temperature:\n *\n * ```ts\n * import { of, distinctUntilChanged } from 'rxjs';\n *\n * const temps$ = of(30, 31, 20, 34, 33, 29, 35, 20);\n *\n * const recordHighs$ = temps$.pipe(\n *   distinctUntilChanged((prevHigh, temp) => {\n *     // If the current temp is less than\n *     // or the same as the previous record,\n *     // the record hasn't changed.\n *     return temp <= prevHigh;\n *   })\n * );\n *\n * recordHighs$.subscribe(console.log);\n * // Logs: 30, 31, 34, 35\n * ```\n *\n * Selecting update events only when the `updatedBy` field shows\n * the account changed hands.\n *\n * ```ts\n * import { of, distinctUntilChanged } from 'rxjs';\n *\n * // A stream of updates to a given account\n * const accountUpdates$ = of(\n *   { updatedBy: 'blesh', data: [] },\n *   { updatedBy: 'blesh', data: [] },\n *   { updatedBy: 'ncjamieson', data: [] },\n *   { updatedBy: 'ncjamieson', data: [] },\n *   { updatedBy: 'blesh', data: [] }\n * );\n *\n * // We only want the events where it changed hands\n * const changedHands$ = accountUpdates$.pipe(\n *   distinctUntilChanged(undefined, update => update.updatedBy)\n * );\n *\n * changedHands$.subscribe(console.log);\n * // Logs:\n * // { updatedBy: 'blesh', data: Array[0] }\n * // { updatedBy: 'ncjamieson', data: Array[0] }\n * // { updatedBy: 'blesh', data: Array[0] }\n * ```\n *\n * @see {@link distinct}\n * @see {@link distinctUntilKeyChanged}\n *\n * @param comparator A function used to compare the previous and current keys for\n * equality. 
Defaults to a `===` check.\n * @param keySelector Used to select a key value to be passed to the `comparator`.\n *\n * @return A function that returns an Observable that emits items from the\n * source Observable with distinct values.\n */\nexport function distinctUntilChanged<T, K>(\n  comparator?: (previous: K, current: K) => boolean,\n  keySelector: (value: T) => K = identity as (value: T) => K\n): MonoTypeOperatorFunction<T> {\n  // We've been allowing `null` do be passed as the `compare`, so we can't do\n  // a default value for the parameter, because that will only work\n  // for `undefined`.\n  comparator = comparator ?? defaultCompare;\n\n  return operate((source, subscriber) => {\n    // The previous key, used to compare against keys selected\n    // from new arrivals to determine \"distinctiveness\".\n    let previousKey: K;\n    // Whether or not this is the first value we've gotten.\n    let first = true;\n\n    source.subscribe(\n      createOperatorSubscriber(subscriber, (value) => {\n        // We always call the key selector.\n        const currentKey = keySelector(value);\n\n        // If it's the first value, we always emit it.\n        // Otherwise, we compare this key to the previous key, and\n        // if the comparer returns false, we emit.\n        if (first || !comparator!(previousKey, currentKey)) {\n          // Update our state *before* we emit the value\n          // as emission can be the source of re-entrant code\n          // in functional libraries like this. 
We only really\n          // need to do this if it's the first value, or if the\n          // key we're tracking in previous needs to change.\n          first = false;\n          previousKey = currentKey;\n\n          // Emit the value!\n          subscriber.next(value);\n        }\n      })\n    );\n  });\n}\n\nfunction defaultCompare(a: any, b: any) {\n  return a === b;\n}\n", "import { distinctUntilChanged } from './distinctUntilChanged';\nimport { MonoTypeOperatorFunction } from '../types';\n\n/* tslint:disable:max-line-length */\nexport function distinctUntilKeyChanged<T>(key: keyof T): MonoTypeOperatorFunction<T>;\nexport function distinctUntilKeyChanged<T, K extends keyof T>(key: K, compare: (x: T[K], y: T[K]) => boolean): MonoTypeOperatorFunction<T>;\n/* tslint:enable:max-line-length */\n\n/**\n * Returns an Observable that emits all items emitted by the source Observable that are distinct by comparison from the previous item,\n * using a property accessed by using the key provided to check if the two items are distinct.\n *\n * If a comparator function is provided, then it will be called for each item to test for whether or not that value should be emitted.\n *\n * If a comparator function is not provided, an equality check is used by default.\n *\n * ## Examples\n *\n * An example comparing the name of persons\n *\n * ```ts\n * import { of, distinctUntilKeyChanged } from 'rxjs';\n *\n * of(\n *   { age: 4, name: 'Foo' },\n *   { age: 7, name: 'Bar' },\n *   { age: 5, name: 'Foo' },\n *   { age: 6, name: 'Foo' }\n * ).pipe(\n *   distinctUntilKeyChanged('name')\n * )\n * .subscribe(x => console.log(x));\n *\n * // displays:\n * // { age: 4, name: 'Foo' }\n * // { age: 7, name: 'Bar' }\n * // { age: 5, name: 'Foo' }\n * ```\n *\n * An example comparing the first letters of the name\n *\n * ```ts\n * import { of, distinctUntilKeyChanged } from 'rxjs';\n *\n * of(\n *   { age: 4, name: 'Foo1' },\n *   { age: 7, name: 'Bar' },\n *   { age: 5, name: 'Foo2' },\n 
*   { age: 6, name: 'Foo3' }\n * ).pipe(\n *   distinctUntilKeyChanged('name', (x, y) => x.substring(0, 3) === y.substring(0, 3))\n * )\n * .subscribe(x => console.log(x));\n *\n * // displays:\n * // { age: 4, name: 'Foo1' }\n * // { age: 7, name: 'Bar' }\n * // { age: 5, name: 'Foo2' }\n * ```\n *\n * @see {@link distinct}\n * @see {@link distinctUntilChanged}\n *\n * @param {string} key String key for object property lookup on each item.\n * @param {function} [compare] Optional comparison function called to test if an item is distinct from the previous item in the source.\n * @return A function that returns an Observable that emits items from the\n * source Observable with distinct values based on the key specified.\n */\nexport function distinctUntilKeyChanged<T, K extends keyof T>(key: K, compare?: (x: T[K], y: T[K]) => boolean): MonoTypeOperatorFunction<T> {\n  return distinctUntilChanged((x: T, y: T) => compare ? compare(x[key], y[key]) : x[key] === y[key]);\n}\n", "import { EmptyError } from '../util/EmptyError';\nimport { MonoTypeOperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * If the source observable completes without emitting a value, it will emit\n * an error. 
The error will be created at that time by the optional\n * `errorFactory` argument, otherwise, the error will be {@link EmptyError}.\n *\n * ![](throwIfEmpty.png)\n *\n * ## Example\n *\n * Throw an error if the document wasn't clicked within 1 second\n *\n * ```ts\n * import { fromEvent, takeUntil, timer, throwIfEmpty } from 'rxjs';\n *\n * const click$ = fromEvent(document, 'click');\n *\n * click$.pipe(\n *   takeUntil(timer(1000)),\n *   throwIfEmpty(() => new Error('The document was not clicked within 1 second'))\n * )\n * .subscribe({\n *   next() {\n *    console.log('The document was clicked');\n *   },\n *   error(err) {\n *     console.error(err.message);\n *   }\n * });\n * ```\n *\n * @param errorFactory A factory function called to produce the\n * error to be thrown when the source observable completes without emitting a\n * value.\n * @return A function that returns an Observable that throws an error if the\n * source Observable completed without emitting.\n */\nexport function throwIfEmpty<T>(errorFactory: () => any = defaultErrorFactory): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    let hasValue = false;\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => {\n          hasValue = true;\n          subscriber.next(value);\n        },\n        () => (hasValue ? subscriber.complete() : subscriber.error(errorFactory()))\n      )\n    );\n  });\n}\n\nfunction defaultErrorFactory() {\n  return new EmptyError();\n}\n", "/** prettier */\nimport { Observable } from '../Observable';\nimport { concat } from '../observable/concat';\nimport { of } from '../observable/of';\nimport { MonoTypeOperatorFunction, SchedulerLike, OperatorFunction, ValueFromArray } from '../types';\n\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `concatAll`. 
Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function endWith<T>(scheduler: SchedulerLike): MonoTypeOperatorFunction<T>;\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `concatAll`. Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function endWith<T, A extends unknown[] = T[]>(\n  ...valuesAndScheduler: [...A, SchedulerLike]\n): OperatorFunction<T, T | ValueFromArray<A>>;\n\nexport function endWith<T, A extends unknown[] = T[]>(...values: A): OperatorFunction<T, T | ValueFromArray<A>>;\n\n/**\n * Returns an observable that will emit all values from the source, then synchronously emit\n * the provided value(s) immediately after the source completes.\n *\n * NOTE: Passing a last argument of a Scheduler is _deprecated_, and may result in incorrect\n * types in TypeScript.\n *\n * This is useful for knowing when an observable ends. Particularly when paired with an\n * operator like {@link takeUntil}\n *\n * ![](endWith.png)\n *\n * ## Example\n *\n * Emit values to know when an interval starts and stops. 
The interval will\n * stop when a user clicks anywhere on the document.\n *\n * ```ts\n * import { interval, map, fromEvent, startWith, takeUntil, endWith } from 'rxjs';\n *\n * const ticker$ = interval(5000).pipe(\n *   map(() => 'tick')\n * );\n *\n * const documentClicks$ = fromEvent(document, 'click');\n *\n * ticker$.pipe(\n *   startWith('interval started'),\n *   takeUntil(documentClicks$),\n *   endWith('interval ended by click')\n * )\n * .subscribe(x => console.log(x));\n *\n * // Result (assuming a user clicks after 15 seconds)\n * // 'interval started'\n * // 'tick'\n * // 'tick'\n * // 'tick'\n * // 'interval ended by click'\n * ```\n *\n * @see {@link startWith}\n * @see {@link concat}\n * @see {@link takeUntil}\n *\n * @param values Items you want the modified Observable to emit last.\n * @return A function that returns an Observable that emits all values from the\n * source, then synchronously emits the provided value(s) immediately after the\n * source completes.\n */\nexport function endWith<T>(...values: Array<T | SchedulerLike>): MonoTypeOperatorFunction<T> {\n  return (source: Observable<T>) => concat(source, of(...values)) as Observable<T>;\n}\n", "import { MonoTypeOperatorFunction } from '../types';\nimport { operate } from '../util/lift';\n\n/**\n * Returns an Observable that mirrors the source Observable, but will call a specified function when\n * the source terminates on complete or error.\n * The specified function will also be called when the subscriber explicitly unsubscribes.\n *\n * ## Examples\n *\n * Execute callback function when the observable completes\n *\n * ```ts\n * import { interval, take, finalize } from 'rxjs';\n *\n * // emit value in sequence every 1 second\n * const source = interval(1000);\n * const example = source.pipe(\n *   take(5), //take only the first 5 values\n *   finalize(() => console.log('Sequence complete')) // Execute when the observable completes\n * );\n * const subscribe = example.subscribe(val => 
console.log(val));\n *\n * // results:\n * // 0\n * // 1\n * // 2\n * // 3\n * // 4\n * // 'Sequence complete'\n * ```\n *\n * Execute callback function when the subscriber explicitly unsubscribes\n *\n * ```ts\n * import { interval, finalize, tap, noop, timer } from 'rxjs';\n *\n * const source = interval(100).pipe(\n *   finalize(() => console.log('[finalize] Called')),\n *   tap({\n *     next: () => console.log('[next] Called'),\n *     error: () => console.log('[error] Not called'),\n *     complete: () => console.log('[tap complete] Not called')\n *   })\n * );\n *\n * const sub = source.subscribe({\n *   next: x => console.log(x),\n *   error: noop,\n *   complete: () => console.log('[complete] Not called')\n * });\n *\n * timer(150).subscribe(() => sub.unsubscribe());\n *\n * // results:\n * // '[next] Called'\n * // 0\n * // '[finalize] Called'\n * ```\n *\n * @param {function} callback Function to be called when source terminates.\n * @return A function that returns an Observable that mirrors the source, but\n * will call the specified function on termination.\n */\nexport function finalize<T>(callback: () => void): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    // TODO: This try/finally was only added for `useDeprecatedSynchronousErrorHandling`.\n    // REMOVE THIS WHEN THAT HOT GARBAGE IS REMOVED IN V8.\n    try {\n      source.subscribe(subscriber);\n    } finally {\n      subscriber.add(callback);\n    }\n  });\n}\n", "import { Observable } from '../Observable';\nimport { EmptyError } from '../util/EmptyError';\nimport { OperatorFunction, TruthyTypesOf } from '../types';\nimport { filter } from './filter';\nimport { take } from './take';\nimport { defaultIfEmpty } from './defaultIfEmpty';\nimport { throwIfEmpty } from './throwIfEmpty';\nimport { identity } from '../util/identity';\n\nexport function first<T, D = T>(predicate?: null, defaultValue?: D): OperatorFunction<T, T | D>;\nexport function first<T>(predicate: 
BooleanConstructor): OperatorFunction<T, TruthyTypesOf<T>>;\nexport function first<T, D>(predicate: BooleanConstructor, defaultValue: D): OperatorFunction<T, TruthyTypesOf<T> | D>;\nexport function first<T, S extends T>(\n  predicate: (value: T, index: number, source: Observable<T>) => value is S,\n  defaultValue?: S\n): OperatorFunction<T, S>;\nexport function first<T, S extends T, D>(\n  predicate: (value: T, index: number, source: Observable<T>) => value is S,\n  defaultValue: D\n): OperatorFunction<T, S | D>;\nexport function first<T, D = T>(\n  predicate: (value: T, index: number, source: Observable<T>) => boolean,\n  defaultValue?: D\n): OperatorFunction<T, T | D>;\n\n/**\n * Emits only the first value (or the first value that meets some condition)\n * emitted by the source Observable.\n *\n * <span class=\"informal\">Emits only the first value. Or emits only the first\n * value that passes some test.</span>\n *\n * ![](first.png)\n *\n * If called with no arguments, `first` emits the first value of the source\n * Observable, then completes. If called with a `predicate` function, `first`\n * emits the first value of the source that matches the specified condition. 
Throws an error if\n * `defaultValue` was not provided and a matching element is not found.\n *\n * ## Examples\n *\n * Emit only the first click that happens on the DOM\n *\n * ```ts\n * import { fromEvent, first } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(first());\n * result.subscribe(x => console.log(x));\n * ```\n *\n * Emits the first click that happens on a DIV\n *\n * ```ts\n * import { fromEvent, first } from 'rxjs';\n *\n * const div = document.createElement('div');\n * div.style.cssText = 'width: 200px; height: 200px; background: #09c;';\n * document.body.appendChild(div);\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(first(ev => (<HTMLElement>ev.target).tagName === 'DIV'));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link filter}\n * @see {@link find}\n * @see {@link take}\n *\n * @throws {EmptyError} Delivers an EmptyError to the Observer's `error`\n * callback if the Observable completes before any `next` notification was sent.\n * This is how `first()` is different from {@link take}(1) which completes instead.\n *\n * @param {function(value: T, index: number, source: Observable<T>): boolean} [predicate]\n * An optional function called with each item to test for condition matching.\n * @param {D} [defaultValue] The default value emitted in case no valid value\n * was found on the source.\n * @return A function that returns an Observable that emits the first item that\n * matches the condition.\n */\nexport function first<T, D>(\n  predicate?: ((value: T, index: number, source: Observable<T>) => boolean) | null,\n  defaultValue?: D\n): OperatorFunction<T, T | D> {\n  const hasDefaultValue = arguments.length >= 2;\n  return (source: Observable<T>) =>\n    source.pipe(\n      predicate ? filter((v, i) => predicate(v, i, source)) : identity,\n      take(1),\n      hasDefaultValue ? defaultIfEmpty(defaultValue!) 
: throwIfEmpty(() => new EmptyError())\n    );\n}\n", "import { EMPTY } from '../observable/empty';\nimport { MonoTypeOperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/**\n * Waits for the source to complete, then emits the last N values from the source,\n * as specified by the `count` argument.\n *\n * ![](takeLast.png)\n *\n * `takeLast` results in an observable that will hold values up to `count` values in memory,\n * until the source completes. It then pushes all values in memory to the consumer, in the\n * order they were received from the source, then notifies the consumer that it is\n * complete.\n *\n * If for some reason the source completes before the `count` supplied to `takeLast` is reached,\n * all values received until that point are emitted, and then completion is notified.\n *\n * **Warning**: Using `takeLast` with an observable that never completes will result\n * in an observable that never emits a value.\n *\n * ## Example\n *\n * Take the last 3 values of an Observable with many values\n *\n * ```ts\n * import { range, takeLast } from 'rxjs';\n *\n * const many = range(1, 100);\n * const lastThree = many.pipe(takeLast(3));\n * lastThree.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link take}\n * @see {@link takeUntil}\n * @see {@link takeWhile}\n * @see {@link skip}\n *\n * @param count The maximum number of values to emit from the end of\n * the sequence of values emitted by the source Observable.\n * @return A function that returns an Observable that emits at most the last\n * `count` values emitted by the source Observable.\n */\nexport function takeLast<T>(count: number): MonoTypeOperatorFunction<T> {\n  return count <= 0\n    ? () => EMPTY\n    : operate((source, subscriber) => {\n        // This buffer will hold the values we are going to emit\n        // when the source completes. 
Since we only want to take the\n        // last N values, we can't emit until we're sure we're not getting\n        // any more values.\n        let buffer: T[] = [];\n        source.subscribe(\n          createOperatorSubscriber(\n            subscriber,\n            (value) => {\n              // Add the most recent value onto the end of our buffer.\n              buffer.push(value);\n              // If our buffer is now larger than the number of values we\n              // want to take, we remove the oldest value from the buffer.\n              count < buffer.length && buffer.shift();\n            },\n            () => {\n              // The source completed, we now know what are last values\n              // are, emit them in the order they were received.\n              for (const value of buffer) {\n                subscriber.next(value);\n              }\n              subscriber.complete();\n            },\n            // Errors are passed through to the consumer\n            undefined,\n            () => {\n              // During finalization release the values in our buffer.\n              buffer = null!;\n            }\n          )\n        );\n      });\n}\n", "import { ObservableInput, ObservableInputTuple, OperatorFunction, SchedulerLike } from '../types';\nimport { operate } from '../util/lift';\nimport { argsOrArgArray } from '../util/argsOrArgArray';\nimport { mergeAll } from './mergeAll';\nimport { popNumber, popScheduler } from '../util/args';\nimport { from } from '../observable/from';\n\n/** @deprecated Replaced with {@link mergeWith}. Will be removed in v8. */\nexport function merge<T, A extends readonly unknown[]>(...sources: [...ObservableInputTuple<A>]): OperatorFunction<T, T | A[number]>;\n/** @deprecated Replaced with {@link mergeWith}. Will be removed in v8. 
*/\nexport function merge<T, A extends readonly unknown[]>(\n  ...sourcesAndConcurrency: [...ObservableInputTuple<A>, number]\n): OperatorFunction<T, T | A[number]>;\n/** @deprecated Replaced with {@link mergeWith}. Will be removed in v8. */\nexport function merge<T, A extends readonly unknown[]>(\n  ...sourcesAndScheduler: [...ObservableInputTuple<A>, SchedulerLike]\n): OperatorFunction<T, T | A[number]>;\n/** @deprecated Replaced with {@link mergeWith}. Will be removed in v8. */\nexport function merge<T, A extends readonly unknown[]>(\n  ...sourcesAndConcurrencyAndScheduler: [...ObservableInputTuple<A>, number, SchedulerLike]\n): OperatorFunction<T, T | A[number]>;\n\nexport function merge<T>(...args: unknown[]): OperatorFunction<T, unknown> {\n  const scheduler = popScheduler(args);\n  const concurrent = popNumber(args, Infinity);\n  args = argsOrArgArray(args);\n\n  return operate((source, subscriber) => {\n    mergeAll(concurrent)(from([source, ...(args as ObservableInput<T>[])], scheduler)).subscribe(subscriber);\n  });\n}\n", "import { ObservableInputTuple, OperatorFunction } from '../types';\nimport { merge } from './merge';\n\n/**\n * Merge the values from all observables to a single observable result.\n *\n * Creates an observable, that when subscribed to, subscribes to the source\n * observable, and all other sources provided as arguments. 
All values from\n * every source are emitted from the resulting subscription.\n *\n * When all sources complete, the resulting observable will complete.\n *\n * When any source errors, the resulting observable will error.\n *\n * ## Example\n *\n * Joining all outputs from multiple user input event streams\n *\n * ```ts\n * import { fromEvent, map, mergeWith } from 'rxjs';\n *\n * const clicks$ = fromEvent(document, 'click').pipe(map(() => 'click'));\n * const mousemoves$ = fromEvent(document, 'mousemove').pipe(map(() => 'mousemove'));\n * const dblclicks$ = fromEvent(document, 'dblclick').pipe(map(() => 'dblclick'));\n *\n * mousemoves$\n *   .pipe(mergeWith(clicks$, dblclicks$))\n *   .subscribe(x => console.log(x));\n *\n * // result (assuming user interactions)\n * // 'mousemove'\n * // 'mousemove'\n * // 'mousemove'\n * // 'click'\n * // 'click'\n * // 'dblclick'\n * ```\n *\n * @see {@link merge}\n *\n * @param otherSources the sources to combine the current source with.\n * @return A function that returns an Observable that merges the values from\n * all given Observables.\n */\nexport function mergeWith<T, A extends readonly unknown[]>(\n  ...otherSources: [...ObservableInputTuple<A>]\n): OperatorFunction<T, T | A[number]> {\n  return merge(...otherSources);\n}\n", "import { Subscription } from '../Subscription';\nimport { EMPTY } from '../observable/empty';\nimport { operate } from '../util/lift';\nimport { MonoTypeOperatorFunction, ObservableInput } from '../types';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { innerFrom } from '../observable/innerFrom';\nimport { timer } from '../observable/timer';\n\nexport interface RepeatConfig {\n  /**\n   * The number of times to repeat the source. 
Defaults to `Infinity`.\n   */\n  count?: number;\n\n  /**\n   * If a `number`, will delay the repeat of the source by that number of milliseconds.\n   * If a function, it will provide the number of times the source has been subscribed to,\n   * and the return value should be a valid observable input that will notify when the source\n   * should be repeated. If the notifier observable is empty, the result will complete.\n   */\n  delay?: number | ((count: number) => ObservableInput<any>);\n}\n\n/**\n * Returns an Observable that will resubscribe to the source stream when the source stream completes.\n *\n * <span class=\"informal\">Repeats all values emitted on the source. It's like {@link retry}, but for non error cases.</span>\n *\n * ![](repeat.png)\n *\n * Repeat will output values from a source until the source completes, then it will resubscribe to the\n * source a specified number of times, with a specified delay. Repeat can be particularly useful in\n * combination with closing operators like {@link take}, {@link takeUntil}, {@link first}, or {@link takeWhile},\n * as it can be used to restart a source again from scratch.\n *\n * Repeat is very similar to {@link retry}, where {@link retry} will resubscribe to the source in the error case, but\n * `repeat` will resubscribe if the source completes.\n *\n * Note that `repeat` will _not_ catch errors. 
Use {@link retry} for that.\n *\n * - `repeat(0)` returns an empty observable\n * - `repeat()` will repeat forever\n * - `repeat({ delay: 200 })` will repeat forever, with a delay of 200ms between repetitions.\n * - `repeat({ count: 2, delay: 400 })` will repeat twice, with a delay of 400ms between repetitions.\n * - `repeat({ delay: (count) => timer(count * 1000) })` will repeat forever, but will have a delay that grows by one second for each repetition.\n *\n * ## Example\n *\n * Repeat a message stream\n *\n * ```ts\n * import { of, repeat } from 'rxjs';\n *\n * const source = of('Repeat message');\n * const result = source.pipe(repeat(3));\n *\n * result.subscribe(x => console.log(x));\n *\n * // Results\n * // 'Repeat message'\n * // 'Repeat message'\n * // 'Repeat message'\n * ```\n *\n * Repeat 3 values, 2 times\n *\n * ```ts\n * import { interval, take, repeat } from 'rxjs';\n *\n * const source = interval(1000);\n * const result = source.pipe(take(3), repeat(2));\n *\n * result.subscribe(x => console.log(x));\n *\n * // Results every second\n * // 0\n * // 1\n * // 2\n * // 0\n * // 1\n * // 2\n * ```\n *\n * Defining two complex repeats with delays on the same source.\n * Note that the second repeat cannot be called until the first\n * repeat as exhausted it's count.\n *\n * ```ts\n * import { defer, of, repeat } from 'rxjs';\n *\n * const source = defer(() => {\n *    return of(`Hello, it is ${new Date()}`)\n * });\n *\n * source.pipe(\n *    // Repeat 3 times with a delay of 1 second between repetitions\n *    repeat({\n *      count: 3,\n *      delay: 1000,\n *    }),\n *\n *    // *Then* repeat forever, but with an exponential step-back\n *    // maxing out at 1 minute.\n *    repeat({\n *      delay: (count) => timer(Math.min(60000, 2 ^ count * 1000))\n *    })\n * )\n * ```\n *\n * @see {@link repeatWhen}\n * @see {@link retry}\n *\n * @param count The number of times the source Observable items are repeated, a count of 0 will yield\n * an empty 
Observable.\n */\nexport function repeat<T>(countOrConfig?: number | RepeatConfig): MonoTypeOperatorFunction<T> {\n  let count = Infinity;\n  let delay: RepeatConfig['delay'];\n\n  if (countOrConfig != null) {\n    if (typeof countOrConfig === 'object') {\n      ({ count = Infinity, delay } = countOrConfig);\n    } else {\n      count = countOrConfig;\n    }\n  }\n\n  return count <= 0\n    ? () => EMPTY\n    : operate((source, subscriber) => {\n        let soFar = 0;\n        let sourceSub: Subscription | null;\n\n        const resubscribe = () => {\n          sourceSub?.unsubscribe();\n          sourceSub = null;\n          if (delay != null) {\n            const notifier = typeof delay === 'number' ? timer(delay) : innerFrom(delay(soFar));\n            const notifierSubscriber = createOperatorSubscriber(subscriber, () => {\n              notifierSubscriber.unsubscribe();\n              subscribeToSource();\n            });\n            notifier.subscribe(notifierSubscriber);\n          } else {\n            subscribeToSource();\n          }\n        };\n\n        const subscribeToSource = () => {\n          let syncUnsub = false;\n          sourceSub = source.subscribe(\n            createOperatorSubscriber(subscriber, undefined, () => {\n              if (++soFar < count) {\n                if (sourceSub) {\n                  resubscribe();\n                } else {\n                  syncUnsub = true;\n                }\n              } else {\n                subscriber.complete();\n              }\n            })\n          );\n\n          if (syncUnsub) {\n            resubscribe();\n          }\n        };\n\n        subscribeToSource();\n      });\n}\n", "import { OperatorFunction } from '../types';\nimport { operate } from '../util/lift';\nimport { scanInternals } from './scanInternals';\n\nexport function scan<V, A = V>(accumulator: (acc: A | V, value: V, index: number) => A): OperatorFunction<V, V | A>;\nexport function scan<V, A>(accumulator: (acc: A, 
value: V, index: number) => A, seed: A): OperatorFunction<V, A>;\nexport function scan<V, A, S>(accumulator: (acc: A | S, value: V, index: number) => A, seed: S): OperatorFunction<V, A>;\n\n// TODO: link to a \"redux pattern\" section in the guide (location TBD)\n\n/**\n * Useful for encapsulating and managing state. Applies an accumulator (or \"reducer function\")\n * to each value from the source after an initial state is established -- either via\n * a `seed` value (second argument), or from the first value from the source.\n *\n * <span class=\"informal\">It's like {@link reduce}, but emits the current\n * accumulation state after each update</span>\n *\n * ![](scan.png)\n *\n * This operator maintains an internal state and emits it after processing each value as follows:\n *\n * 1. First value arrives\n *   - If a `seed` value was supplied (as the second argument to `scan`), let `state = seed` and `value = firstValue`.\n *   - If NO `seed` value was supplied (no second argument), let `state = firstValue` and go to 3.\n * 2. Let `state = accumulator(state, value)`.\n *   - If an error is thrown by `accumulator`, notify the consumer of an error. The process ends.\n * 3. Emit `state`.\n * 4. Next value arrives, let `value = nextValue`, go to 2.\n *\n * ## Examples\n *\n * An average of previous numbers. This example shows how\n * not providing a `seed` can prime the stream with the\n * first value from the source.\n *\n * ```ts\n * import { of, scan, map } from 'rxjs';\n *\n * const numbers$ = of(1, 2, 3);\n *\n * numbers$\n *   .pipe(\n *     // Get the sum of the numbers coming in.\n *     scan((total, n) => total + n),\n *     // Get the average by dividing the sum by the total number\n *     // received so far (which is 1 more than the zero-based index).\n *     map((sum, index) => sum / (index + 1))\n *   )\n *   .subscribe(console.log);\n * ```\n *\n * The Fibonacci sequence. This example shows how you can use\n * a seed to prime accumulation process. 
Also... you know... Fibonacci.\n * So important to like, computers and stuff that its whiteboarded\n * in job interviews. Now you can show them the Rx version! (Please don't, haha)\n *\n * ```ts\n * import { interval, scan, map, startWith } from 'rxjs';\n *\n * const firstTwoFibs = [0, 1];\n * // An endless stream of Fibonacci numbers.\n * const fibonacci$ = interval(1000).pipe(\n *   // Scan to get the fibonacci numbers (after 0, 1)\n *   scan(([a, b]) => [b, a + b], firstTwoFibs),\n *   // Get the second number in the tuple, it's the one you calculated\n *   map(([, n]) => n),\n *   // Start with our first two digits :)\n *   startWith(...firstTwoFibs)\n * );\n *\n * fibonacci$.subscribe(console.log);\n * ```\n *\n * @see {@link expand}\n * @see {@link mergeScan}\n * @see {@link reduce}\n * @see {@link switchScan}\n *\n * @param accumulator A \"reducer function\". This will be called for each value after an initial state is\n * acquired.\n * @param seed The initial state. If this is not provided, the first value from the source will\n * be used as the initial state, and emitted without going through the accumulator. All subsequent values\n * will be processed by the accumulator function. If this is provided, all values will go through\n * the accumulator function.\n * @return A function that returns an Observable of the accumulated values.\n */\nexport function scan<V, A, S>(accumulator: (acc: V | A | S, value: V, index: number) => A, seed?: S): OperatorFunction<V, V | A> {\n  // providing a seed of `undefined` *should* be valid and trigger\n  // hasSeed! 
so don't use `seed !== undefined` checks!\n  // For this reason, we have to check it here at the original call site\n  // otherwise inside Operator/Subscriber we won't know if `undefined`\n  // means they didn't provide anything or if they literally provided `undefined`\n  return operate(scanInternals(accumulator, seed as S, arguments.length >= 2, true));\n}\n", "import { innerFrom } from '../observable/innerFrom';\nimport { Subject } from '../Subject';\nimport { SafeSubscriber } from '../Subscriber';\nimport { Subscription } from '../Subscription';\nimport { MonoTypeOperatorFunction, SubjectLike, ObservableInput } from '../types';\nimport { operate } from '../util/lift';\n\nexport interface ShareConfig<T> {\n  /**\n   * The factory used to create the subject that will connect the source observable to\n   * multicast consumers.\n   */\n  connector?: () => SubjectLike<T>;\n  /**\n   * If `true`, the resulting observable will reset internal state on error from source and return to a \"cold\" state. This\n   * allows the resulting observable to be \"retried\" in the event of an error.\n   * If `false`, when an error comes from the source it will push the error into the connecting subject, and the subject\n   * will remain the connecting subject, meaning the resulting observable will not go \"cold\" again, and subsequent retries\n   * or resubscriptions will resubscribe to that same subject. In all cases, RxJS subjects will emit the same error again, however\n   * {@link ReplaySubject} will also push its buffered values before pushing the error.\n   * It is also possible to pass a notifier factory returning an `ObservableInput` instead which grants more fine-grained\n   * control over how and when the reset should happen. 
This allows behaviors like conditional or delayed resets.\n   */\n  resetOnError?: boolean | ((error: any) => ObservableInput<any>);\n  /**\n   * If `true`, the resulting observable will reset internal state on completion from source and return to a \"cold\" state. This\n   * allows the resulting observable to be \"repeated\" after it is done.\n   * If `false`, when the source completes, it will push the completion through the connecting subject, and the subject\n   * will remain the connecting subject, meaning the resulting observable will not go \"cold\" again, and subsequent repeats\n   * or resubscriptions will resubscribe to that same subject.\n   * It is also possible to pass a notifier factory returning an `ObservableInput` instead which grants more fine-grained\n   * control over how and when the reset should happen. This allows behaviors like conditional or delayed resets.\n   */\n  resetOnComplete?: boolean | (() => ObservableInput<any>);\n  /**\n   * If `true`, when the number of subscribers to the resulting observable reaches zero due to those subscribers unsubscribing, the\n   * internal state will be reset and the resulting observable will return to a \"cold\" state. This means that the next\n   * time the resulting observable is subscribed to, a new subject will be created and the source will be subscribed to\n   * again.\n   * If `false`, when the number of subscribers to the resulting observable reaches zero due to unsubscription, the subject\n   * will remain connected to the source, and new subscriptions to the result will be connected through that same subject.\n   * It is also possible to pass a notifier factory returning an `ObservableInput` instead which grants more fine-grained\n   * control over how and when the reset should happen. 
This allows behaviors like conditional or delayed resets.\n   */\n  resetOnRefCountZero?: boolean | (() => ObservableInput<any>);\n}\n\nexport function share<T>(): MonoTypeOperatorFunction<T>;\n\nexport function share<T>(options: ShareConfig<T>): MonoTypeOperatorFunction<T>;\n\n/**\n * Returns a new Observable that multicasts (shares) the original Observable. As long as there is at least one\n * Subscriber this Observable will be subscribed and emitting data. When all subscribers have unsubscribed it will\n * unsubscribe from the source Observable. Because the Observable is multicasting it makes the stream `hot`.\n * This is an alias for `multicast(() => new Subject()), refCount()`.\n *\n * The subscription to the underlying source Observable can be reset (unsubscribe and resubscribe for new subscribers),\n * if the subscriber count to the shared observable drops to 0, or if the source Observable errors or completes. It is\n * possible to use notifier factories for the resets to allow for behaviors like conditional or delayed resets. Please\n * note that resetting on error or complete of the source Observable does not behave like a transparent retry or restart\n * of the source because the error or complete will be forwarded to all subscribers and their subscription will be\n * closed. Only new subscribers after a reset on error or complete happened will cause a fresh subscription to the\n * source. 
To achieve transparent retries or restarts pipe the source through appropriate operators before sharing.\n *\n * ![](share.png)\n *\n * ## Example\n *\n * Generate new multicast Observable from the `source` Observable value\n *\n * ```ts\n * import { interval, tap, map, take, share } from 'rxjs';\n *\n * const source = interval(1000).pipe(\n *   tap(x => console.log('Processing: ', x)),\n *   map(x => x * x),\n *   take(6),\n *   share()\n * );\n *\n * source.subscribe(x => console.log('subscription 1: ', x));\n * source.subscribe(x => console.log('subscription 2: ', x));\n *\n * // Logs:\n * // Processing: 0\n * // subscription 1: 0\n * // subscription 2: 0\n * // Processing: 1\n * // subscription 1: 1\n * // subscription 2: 1\n * // Processing: 2\n * // subscription 1: 4\n * // subscription 2: 4\n * // Processing: 3\n * // subscription 1: 9\n * // subscription 2: 9\n * // Processing: 4\n * // subscription 1: 16\n * // subscription 2: 16\n * // Processing: 5\n * // subscription 1: 25\n * // subscription 2: 25\n * ```\n *\n * ## Example with notifier factory: Delayed reset\n *\n * ```ts\n * import { interval, take, share, timer } from 'rxjs';\n *\n * const source = interval(1000).pipe(\n *   take(3),\n *   share({\n *     resetOnRefCountZero: () => timer(1000)\n *   })\n * );\n *\n * const subscriptionOne = source.subscribe(x => console.log('subscription 1: ', x));\n * setTimeout(() => subscriptionOne.unsubscribe(), 1300);\n *\n * setTimeout(() => source.subscribe(x => console.log('subscription 2: ', x)), 1700);\n *\n * setTimeout(() => source.subscribe(x => console.log('subscription 3: ', x)), 5000);\n *\n * // Logs:\n * // subscription 1:  0\n * // (subscription 1 unsubscribes here)\n * // (subscription 2 subscribes here ~400ms later, source was not reset)\n * // subscription 2:  1\n * // subscription 2:  2\n * // (subscription 2 unsubscribes here)\n * // (subscription 3 subscribes here ~2000ms later, source did reset before)\n * // subscription 3:  0\n * // 
subscription 3:  1\n * // subscription 3:  2\n * ```\n *\n * @see {@link shareReplay}\n *\n * @return A function that returns an Observable that mirrors the source.\n */\nexport function share<T>(options: ShareConfig<T> = {}): MonoTypeOperatorFunction<T> {\n  const { connector = () => new Subject<T>(), resetOnError = true, resetOnComplete = true, resetOnRefCountZero = true } = options;\n  // It's necessary to use a wrapper here, as the _operator_ must be\n  // referentially transparent. Otherwise, it cannot be used in calls to the\n  // static `pipe` function - to create a partial pipeline.\n  //\n  // The _operator function_ - the function returned by the _operator_ - will\n  // not be referentially transparent - as it shares its source - but the\n  // _operator function_ is called when the complete pipeline is composed via a\n  // call to a source observable's `pipe` method - not when the static `pipe`\n  // function is called.\n  return (wrapperSource) => {\n    let connection: SafeSubscriber<T> | undefined;\n    let resetConnection: Subscription | undefined;\n    let subject: SubjectLike<T> | undefined;\n    let refCount = 0;\n    let hasCompleted = false;\n    let hasErrored = false;\n\n    const cancelReset = () => {\n      resetConnection?.unsubscribe();\n      resetConnection = undefined;\n    };\n    // Used to reset the internal state to a \"cold\"\n    // state, as though it had never been subscribed to.\n    const reset = () => {\n      cancelReset();\n      connection = subject = undefined;\n      hasCompleted = hasErrored = false;\n    };\n    const resetAndUnsubscribe = () => {\n      // We need to capture the connection before\n      // we reset (if we need to reset).\n      const conn = connection;\n      reset();\n      conn?.unsubscribe();\n    };\n\n    return operate<T, T>((source, subscriber) => {\n      refCount++;\n      if (!hasErrored && !hasCompleted) {\n        cancelReset();\n      }\n\n      // Create the subject if we don't have one 
yet. Grab a local reference to\n      // it as well, which avoids non-null assertions when using it and, if we\n      // connect to it now, then error/complete need a reference after it was\n      // reset.\n      const dest = (subject = subject ?? connector());\n\n      // Add the finalization directly to the subscriber - instead of returning it -\n      // so that the handling of the subscriber's unsubscription will be wired\n      // up _before_ the subscription to the source occurs. This is done so that\n      // the assignment to the source connection's `closed` property will be seen\n      // by synchronous firehose sources.\n      subscriber.add(() => {\n        refCount--;\n\n        // If we're resetting on refCount === 0, and it's 0, we only want to do\n        // that on \"unsubscribe\", really. Resetting on error or completion is a different\n        // configuration.\n        if (refCount === 0 && !hasErrored && !hasCompleted) {\n          resetConnection = handleReset(resetAndUnsubscribe, resetOnRefCountZero);\n        }\n      });\n\n      // The following line adds the subscription to the subscriber passed.\n      // Basically, `subscriber === dest.subscribe(subscriber)` is `true`.\n      dest.subscribe(subscriber);\n\n      if (\n        !connection &&\n        // Check this shareReplay is still activate - it can be reset to 0\n        // and be \"unsubscribed\" _before_ it actually subscribes.\n        // If we were to subscribe then, it'd leak and get stuck.\n        refCount > 0\n      ) {\n        // We need to create a subscriber here - rather than pass an observer and\n        // assign the returned subscription to connection - because it's possible\n        // for reentrant subscriptions to the shared observable to occur and in\n        // those situations we want connection to be already-assigned so that we\n        // don't create another connection to the source.\n        connection = new SafeSubscriber({\n          next: (value) => 
dest.next(value),\n          error: (err) => {\n            hasErrored = true;\n            cancelReset();\n            resetConnection = handleReset(reset, resetOnError, err);\n            dest.error(err);\n          },\n          complete: () => {\n            hasCompleted = true;\n            cancelReset();\n            resetConnection = handleReset(reset, resetOnComplete);\n            dest.complete();\n          },\n        });\n        innerFrom(source).subscribe(connection);\n      }\n    })(wrapperSource);\n  };\n}\n\nfunction handleReset<T extends unknown[] = never[]>(\n  reset: () => void,\n  on: boolean | ((...args: T) => ObservableInput<any>),\n  ...args: T\n): Subscription | undefined {\n  if (on === true) {\n    reset();\n    return;\n  }\n\n  if (on === false) {\n    return;\n  }\n\n  const onSubscriber = new SafeSubscriber({\n    next: () => {\n      onSubscriber.unsubscribe();\n      reset();\n    },\n  });\n\n  return innerFrom(on(...args)).subscribe(onSubscriber);\n}\n", "import { ReplaySubject } from '../ReplaySubject';\nimport { MonoTypeOperatorFunction, SchedulerLike } from '../types';\nimport { share } from './share';\n\nexport interface ShareReplayConfig {\n  bufferSize?: number;\n  windowTime?: number;\n  refCount: boolean;\n  scheduler?: SchedulerLike;\n}\n\nexport function shareReplay<T>(config: ShareReplayConfig): MonoTypeOperatorFunction<T>;\nexport function shareReplay<T>(bufferSize?: number, windowTime?: number, scheduler?: SchedulerLike): MonoTypeOperatorFunction<T>;\n\n/**\n * Share source and replay specified number of emissions on subscription.\n *\n * This operator is a specialization of `replay` that connects to a source observable\n * and multicasts through a `ReplaySubject` constructed with the specified arguments.\n * A successfully completed source will stay cached in the `shareReplay`ed observable forever,\n * but an errored source can be retried.\n *\n * ## Why use `shareReplay`?\n *\n * You generally want to use 
`shareReplay` when you have side-effects or taxing computations\n * that you do not wish to be executed amongst multiple subscribers.\n * It may also be valuable in situations where you know you will have late subscribers to\n * a stream that need access to previously emitted values.\n * This ability to replay values on subscription is what differentiates {@link share} and `shareReplay`.\n *\n * ## Reference counting\n *\n * By default `shareReplay` will use `refCount` of false, meaning that it will _not_ unsubscribe the\n * source when the reference counter drops to zero, i.e. the inner `ReplaySubject` will _not_ be unsubscribed\n * (and potentially run for ever).\n * This is the default as it is expected that `shareReplay` is often used to keep around expensive to setup\n * observables which we want to keep running instead of having to do the expensive setup again.\n *\n * As of RXJS version 6.4.0 a new overload signature was added to allow for manual control over what\n * happens when the operators internal reference counter drops to zero.\n * If `refCount` is true, the source will be unsubscribed from once the reference count drops to zero, i.e.\n * the inner `ReplaySubject` will be unsubscribed. 
All new subscribers will receive value emissions from a\n * new `ReplaySubject` which in turn will cause a new subscription to the source observable.\n *\n * ## Examples\n *\n * Example with a third subscriber coming late to the party\n *\n * ```ts\n * import { interval, take, shareReplay } from 'rxjs';\n *\n * const shared$ = interval(2000).pipe(\n *   take(6),\n *   shareReplay(3)\n * );\n *\n * shared$.subscribe(x => console.log('sub A: ', x));\n * shared$.subscribe(y => console.log('sub B: ', y));\n *\n * setTimeout(() => {\n *   shared$.subscribe(y => console.log('sub C: ', y));\n * }, 11000);\n *\n * // Logs:\n * // (after ~2000 ms)\n * // sub A: 0\n * // sub B: 0\n * // (after ~4000 ms)\n * // sub A: 1\n * // sub B: 1\n * // (after ~6000 ms)\n * // sub A: 2\n * // sub B: 2\n * // (after ~8000 ms)\n * // sub A: 3\n * // sub B: 3\n * // (after ~10000 ms)\n * // sub A: 4\n * // sub B: 4\n * // (after ~11000 ms, sub C gets the last 3 values)\n * // sub C: 2\n * // sub C: 3\n * // sub C: 4\n * // (after ~12000 ms)\n * // sub A: 5\n * // sub B: 5\n * // sub C: 5\n * ```\n *\n * Example for `refCount` usage\n *\n * ```ts\n * import { Observable, tap, interval, shareReplay, take } from 'rxjs';\n *\n * const log = <T>(name: string, source: Observable<T>) => source.pipe(\n *   tap({\n *     subscribe: () => console.log(`${ name }: subscribed`),\n *     next: value => console.log(`${ name }: ${ value }`),\n *     complete: () => console.log(`${ name }: completed`),\n *     finalize: () => console.log(`${ name }: unsubscribed`)\n *   })\n * );\n *\n * const obs$ = log('source', interval(1000));\n *\n * const shared$ = log('shared', obs$.pipe(\n *   shareReplay({ bufferSize: 1, refCount: true }),\n *   take(2)\n * ));\n *\n * shared$.subscribe(x => console.log('sub A: ', x));\n * shared$.subscribe(y => console.log('sub B: ', y));\n *\n * // PRINTS:\n * // shared: subscribed <-- reference count = 1\n * // source: subscribed\n * // shared: subscribed <-- reference count = 
2\n * // source: 0\n * // shared: 0\n * // sub A: 0\n * // shared: 0\n * // sub B: 0\n * // source: 1\n * // shared: 1\n * // sub A: 1\n * // shared: completed <-- take(2) completes the subscription for sub A\n * // shared: unsubscribed <-- reference count = 1\n * // shared: 1\n * // sub B: 1\n * // shared: completed <-- take(2) completes the subscription for sub B\n * // shared: unsubscribed <-- reference count = 0\n * // source: unsubscribed <-- replaySubject unsubscribes from source observable because the reference count dropped to 0 and refCount is true\n *\n * // In case of refCount being false, the unsubscribe is never called on the source and the source would keep on emitting, even if no subscribers\n * // are listening.\n * // source: 2\n * // source: 3\n * // source: 4\n * // ...\n * ```\n *\n * @see {@link publish}\n * @see {@link share}\n * @see {@link publishReplay}\n *\n * @param configOrBufferSize Maximum element count of the replay buffer or {@link ShareReplayConfig configuration}\n * object.\n * @param windowTime Maximum time length of the replay buffer in milliseconds.\n * @param scheduler Scheduler where connected observers within the selector function\n * will be invoked on.\n * @return A function that returns an Observable sequence that contains the\n * elements of a sequence produced by multicasting the source sequence within a\n * selector function.\n */\nexport function shareReplay<T>(\n  configOrBufferSize?: ShareReplayConfig | number,\n  windowTime?: number,\n  scheduler?: SchedulerLike\n): MonoTypeOperatorFunction<T> {\n  let bufferSize: number;\n  let refCount = false;\n  if (configOrBufferSize && typeof configOrBufferSize === 'object') {\n    ({ bufferSize = Infinity, windowTime = Infinity, refCount = false, scheduler } = configOrBufferSize);\n  } else {\n    bufferSize = (configOrBufferSize ?? 
Infinity) as number;\n  }\n  return share<T>({\n    connector: () => new ReplaySubject(bufferSize, windowTime, scheduler),\n    resetOnError: true,\n    resetOnComplete: false,\n    resetOnRefCountZero: refCount,\n  });\n}\n", "import { MonoTypeOperatorFunction } from '../types';\nimport { filter } from './filter';\n\n/**\n * Returns an Observable that skips the first `count` items emitted by the source Observable.\n *\n * ![](skip.png)\n *\n * Skips the values until the sent notifications are equal or less than provided skip count. It raises\n * an error if skip count is equal or more than the actual number of emits and source raises an error.\n *\n * ## Example\n *\n * Skip the values before the emission\n *\n * ```ts\n * import { interval, skip } from 'rxjs';\n *\n * // emit every half second\n * const source = interval(500);\n * // skip the first 10 emitted values\n * const result = source.pipe(skip(10));\n *\n * result.subscribe(value => console.log(value));\n * // output: 10...11...12...13...\n * ```\n *\n * @see {@link last}\n * @see {@link skipWhile}\n * @see {@link skipUntil}\n * @see {@link skipLast}\n *\n * @param {Number} count - The number of times, items emitted by source Observable should be skipped.\n * @return A function that returns an Observable that skips the first `count`\n * values emitted by the source Observable.\n */\nexport function skip<T>(count: number): MonoTypeOperatorFunction<T> {\n  return filter((_, index) => count <= index);\n}\n", "import { MonoTypeOperatorFunction, ObservableInput } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { innerFrom } from '../observable/innerFrom';\nimport { noop } from '../util/noop';\n\n/**\n * Returns an Observable that skips items emitted by the source Observable until a second Observable emits an item.\n *\n * The `skipUntil` operator causes the observable stream to skip the emission of values until the passed in 
observable\n * emits the first value. This can be particularly useful in combination with user interactions, responses of HTTP\n * requests or waiting for specific times to pass by.\n *\n * ![](skipUntil.png)\n *\n * Internally, the `skipUntil` operator subscribes to the passed in `notifier` `ObservableInput` (which gets converted\n * to an Observable) in order to recognize the emission of its first value. When `notifier` emits next, the operator\n * unsubscribes from it and starts emitting the values of the *source* observable until it completes or errors. It\n * will never let the *source* observable emit any values if the `notifier` completes or throws an error without\n * emitting a value before.\n *\n * ## Example\n *\n * In the following example, all emitted values of the interval observable are skipped until the user clicks anywhere\n * within the page\n *\n * ```ts\n * import { interval, fromEvent, skipUntil } from 'rxjs';\n *\n * const intervalObservable = interval(1000);\n * const click = fromEvent(document, 'click');\n *\n * const emitAfterClick = intervalObservable.pipe(\n *   skipUntil(click)\n * );\n * // clicked at 4.6s. output: 5...6...7...8........ or\n * // clicked at 7.3s. 
output: 8...9...10..11.......\n * emitAfterClick.subscribe(value => console.log(value));\n * ```\n *\n * @see {@link last}\n * @see {@link skip}\n * @see {@link skipWhile}\n * @see {@link skipLast}\n *\n * @param notifier An `ObservableInput` that has to emit an item before the source Observable elements begin to\n * be mirrored by the resulting Observable.\n * @return A function that returns an Observable that skips items from the\n * source Observable until the `notifier` Observable emits an item, then emits the\n * remaining items.\n */\nexport function skipUntil<T>(notifier: ObservableInput<any>): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    let taking = false;\n\n    const skipSubscriber = createOperatorSubscriber(\n      subscriber,\n      () => {\n        skipSubscriber?.unsubscribe();\n        taking = true;\n      },\n      noop\n    );\n\n    innerFrom(notifier).subscribe(skipSubscriber);\n\n    source.subscribe(createOperatorSubscriber(subscriber, (value) => taking && subscriber.next(value)));\n  });\n}\n", "import { concat } from '../observable/concat';\nimport { OperatorFunction, SchedulerLike, ValueFromArray } from '../types';\nimport { popScheduler } from '../util/args';\nimport { operate } from '../util/lift';\n\n// Devs are more likely to pass null or undefined than they are a scheduler\n// without accompanying values. To make things easier for (naughty) devs who\n// use the `strictNullChecks: false` TypeScript compiler option, these\n// overloads with explicit null and undefined values are included.\n\nexport function startWith<T>(value: null): OperatorFunction<T, T | null>;\nexport function startWith<T>(value: undefined): OperatorFunction<T, T | undefined>;\n\n/** @deprecated The `scheduler` parameter will be removed in v8. Use `scheduled` and `concatAll`. 
Details: https://rxjs.dev/deprecations/scheduler-argument */\nexport function startWith<T, A extends readonly unknown[] = T[]>(\n  ...valuesAndScheduler: [...A, SchedulerLike]\n): OperatorFunction<T, T | ValueFromArray<A>>;\nexport function startWith<T, A extends readonly unknown[] = T[]>(...values: A): OperatorFunction<T, T | ValueFromArray<A>>;\n\n/**\n * Returns an observable that, at the moment of subscription, will synchronously emit all\n * values provided to this operator, then subscribe to the source and mirror all of its emissions\n * to subscribers.\n *\n * This is a useful way to know when subscription has occurred on an existing observable.\n *\n * <span class=\"informal\">First emits its arguments in order, and then any\n * emissions from the source.</span>\n *\n * ![](startWith.png)\n *\n * ## Examples\n *\n * Emit a value when a timer starts.\n *\n * ```ts\n * import { timer, map, startWith } from 'rxjs';\n *\n * timer(1000)\n *   .pipe(\n *     map(() => 'timer emit'),\n *     startWith('timer start')\n *   )\n *   .subscribe(x => console.log(x));\n *\n * // results:\n * // 'timer start'\n * // 'timer emit'\n * ```\n *\n * @param values Items you want the modified Observable to emit first.\n * @return A function that returns an Observable that synchronously emits\n * provided values before subscribing to the source Observable.\n *\n * @see {@link endWith}\n * @see {@link finalize}\n * @see {@link concat}\n */\nexport function startWith<T, D>(...values: D[]): OperatorFunction<T, T | D> {\n  const scheduler = popScheduler(values);\n  return operate((source, subscriber) => {\n    // Here we can't pass `undefined` as a scheduler, because if we did, the\n    // code inside of `concat` would be confused by the `undefined`, and treat it\n    // like an invalid observable. So we have to split it two different ways.\n    (scheduler ? 
concat(values, source, scheduler) : concat(values, source)).subscribe(subscriber);\n  });\n}\n", "import { Subscriber } from '../Subscriber';\nimport { ObservableInput, OperatorFunction, ObservedValueOf } from '../types';\nimport { innerFrom } from '../observable/innerFrom';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\n/* tslint:disable:max-line-length */\nexport function switchMap<T, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O\n): OperatorFunction<T, ObservedValueOf<O>>;\n/** @deprecated The `resultSelector` parameter will be removed in v8. Use an inner `map` instead. Details: https://rxjs.dev/deprecations/resultSelector */\nexport function switchMap<T, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  resultSelector: undefined\n): OperatorFunction<T, ObservedValueOf<O>>;\n/** @deprecated The `resultSelector` parameter will be removed in v8. Use an inner `map` instead. Details: https://rxjs.dev/deprecations/resultSelector */\nexport function switchMap<T, R, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  resultSelector: (outerValue: T, innerValue: ObservedValueOf<O>, outerIndex: number, innerIndex: number) => R\n): OperatorFunction<T, R>;\n/* tslint:enable:max-line-length */\n\n/**\n * Projects each source value to an Observable which is merged in the output\n * Observable, emitting values only from the most recently projected Observable.\n *\n * <span class=\"informal\">Maps each value to an Observable, then flattens all of\n * these inner Observables using {@link switchAll}.</span>\n *\n * ![](switchMap.png)\n *\n * Returns an Observable that emits items based on applying a function that you\n * supply to each item emitted by the source Observable, where that function\n * returns an (so-called \"inner\") Observable. 
Each time it observes one of these\n * inner Observables, the output Observable begins emitting the items emitted by\n * that inner Observable. When a new inner Observable is emitted, `switchMap`\n * stops emitting items from the earlier-emitted inner Observable and begins\n * emitting items from the new one. It continues to behave like this for\n * subsequent inner Observables.\n *\n * ## Example\n *\n * Generate new Observable according to source Observable values\n *\n * ```ts\n * import { of, switchMap } from 'rxjs';\n *\n * const switched = of(1, 2, 3).pipe(switchMap(x => of(x, x ** 2, x ** 3)));\n * switched.subscribe(x => console.log(x));\n * // outputs\n * // 1\n * // 1\n * // 1\n * // 2\n * // 4\n * // 8\n * // 3\n * // 9\n * // 27\n * ```\n *\n * Restart an interval Observable on every click event\n *\n * ```ts\n * import { fromEvent, switchMap, interval } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(switchMap(() => interval(1000)));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link concatMap}\n * @see {@link exhaustMap}\n * @see {@link mergeMap}\n * @see {@link switchAll}\n * @see {@link switchMapTo}\n *\n * @param {function(value: T, index: number): ObservableInput} project A function\n * that, when applied to an item emitted by the source Observable, returns an\n * Observable.\n * @return A function that returns an Observable that emits the result of\n * applying the projection function (and the optional deprecated\n * `resultSelector`) to each item emitted by the source Observable and taking\n * only the values from the most recently projected inner Observable.\n */\nexport function switchMap<T, R, O extends ObservableInput<any>>(\n  project: (value: T, index: number) => O,\n  resultSelector?: (outerValue: T, innerValue: ObservedValueOf<O>, outerIndex: number, innerIndex: number) => R\n): OperatorFunction<T, ObservedValueOf<O> | R> {\n  return operate((source, subscriber) => {\n  
  let innerSubscriber: Subscriber<ObservedValueOf<O>> | null = null;\n    let index = 0;\n    // Whether or not the source subscription has completed\n    let isComplete = false;\n\n    // We only complete the result if the source is complete AND we don't have an active inner subscription.\n    // This is called both when the source completes and when the inners complete.\n    const checkComplete = () => isComplete && !innerSubscriber && subscriber.complete();\n\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        (value) => {\n          // Cancel the previous inner subscription if there was one\n          innerSubscriber?.unsubscribe();\n          let innerIndex = 0;\n          const outerIndex = index++;\n          // Start the next inner subscription\n          innerFrom(project(value, outerIndex)).subscribe(\n            (innerSubscriber = createOperatorSubscriber(\n              subscriber,\n              // When we get a new inner value, next it through. Note that this is\n              // handling the deprecate result selector here. This is because with this architecture\n              // it ends up being smaller than using the map operator.\n              (innerValue) => subscriber.next(resultSelector ? resultSelector(value, innerValue, outerIndex, innerIndex++) : innerValue),\n              () => {\n                // The inner has completed. 
Null out the inner subscriber to\n                // free up memory and to signal that we have no inner subscription\n                // currently.\n                innerSubscriber = null!;\n                checkComplete();\n              }\n            ))\n          );\n        },\n        () => {\n          isComplete = true;\n          checkComplete();\n        }\n      )\n    );\n  });\n}\n", "import { MonoTypeOperatorFunction, ObservableInput } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { innerFrom } from '../observable/innerFrom';\nimport { noop } from '../util/noop';\n\n/**\n * Emits the values emitted by the source Observable until a `notifier`\n * Observable emits a value.\n *\n * <span class=\"informal\">Lets values pass until a second Observable,\n * `notifier`, emits a value. Then, it completes.</span>\n *\n * ![](takeUntil.png)\n *\n * `takeUntil` subscribes and begins mirroring the source Observable. It also\n * monitors a second Observable, `notifier` that you provide. If the `notifier`\n * emits a value, the output Observable stops mirroring the source Observable\n * and completes. 
If the `notifier` doesn't emit any value and completes\n * then `takeUntil` will pass all values.\n *\n * ## Example\n *\n * Tick every second until the first click happens\n *\n * ```ts\n * import { interval, fromEvent, takeUntil } from 'rxjs';\n *\n * const source = interval(1000);\n * const clicks = fromEvent(document, 'click');\n * const result = source.pipe(takeUntil(clicks));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link take}\n * @see {@link takeLast}\n * @see {@link takeWhile}\n * @see {@link skip}\n *\n * @param {Observable} notifier The Observable whose first emitted value will\n * cause the output Observable of `takeUntil` to stop emitting values from the\n * source Observable.\n * @return A function that returns an Observable that emits the values from the\n * source Observable until `notifier` emits its first value.\n */\nexport function takeUntil<T>(notifier: ObservableInput<any>): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    innerFrom(notifier).subscribe(createOperatorSubscriber(subscriber, () => subscriber.complete(), noop));\n    !subscriber.closed && source.subscribe(subscriber);\n  });\n}\n", "import { OperatorFunction, MonoTypeOperatorFunction, TruthyTypesOf } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\n\nexport function takeWhile<T>(predicate: BooleanConstructor, inclusive: true): MonoTypeOperatorFunction<T>;\nexport function takeWhile<T>(predicate: BooleanConstructor, inclusive: false): OperatorFunction<T, TruthyTypesOf<T>>;\nexport function takeWhile<T>(predicate: BooleanConstructor): OperatorFunction<T, TruthyTypesOf<T>>;\nexport function takeWhile<T, S extends T>(predicate: (value: T, index: number) => value is S): OperatorFunction<T, S>;\nexport function takeWhile<T, S extends T>(predicate: (value: T, index: number) => value is S, inclusive: false): OperatorFunction<T, S>;\nexport function 
takeWhile<T>(predicate: (value: T, index: number) => boolean, inclusive?: boolean): MonoTypeOperatorFunction<T>;\n\n/**\n * Emits values emitted by the source Observable so long as each value satisfies\n * the given `predicate`, and then completes as soon as this `predicate` is not\n * satisfied.\n *\n * <span class=\"informal\">Takes values from the source only while they pass the\n * condition given. When the first value does not satisfy, it completes.</span>\n *\n * ![](takeWhile.png)\n *\n * `takeWhile` subscribes and begins mirroring the source Observable. Each value\n * emitted on the source is given to the `predicate` function which returns a\n * boolean, representing a condition to be satisfied by the source values. The\n * output Observable emits the source values until such time as the `predicate`\n * returns false, at which point `takeWhile` stops mirroring the source\n * Observable and completes the output Observable.\n *\n * ## Example\n *\n * Emit click events only while the clientX property is greater than 200\n *\n * ```ts\n * import { fromEvent, takeWhile } from 'rxjs';\n *\n * const clicks = fromEvent<PointerEvent>(document, 'click');\n * const result = clicks.pipe(takeWhile(ev => ev.clientX > 200));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link take}\n * @see {@link takeLast}\n * @see {@link takeUntil}\n * @see {@link skip}\n *\n * @param {function(value: T, index: number): boolean} predicate A function that\n * evaluates a value emitted by the source Observable and returns a boolean.\n * Also takes the (zero-based) index as the second argument.\n * @param {boolean} inclusive When set to `true` the value that caused\n * `predicate` to return `false` will also be emitted.\n * @return A function that returns an Observable that emits values from the\n * source Observable so long as each value satisfies the condition defined by\n * the `predicate`, then completes.\n */\nexport function takeWhile<T>(predicate: (value: T, 
index: number) => boolean, inclusive = false): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    let index = 0;\n    source.subscribe(\n      createOperatorSubscriber(subscriber, (value) => {\n        const result = predicate(value, index++);\n        (result || inclusive) && subscriber.next(value);\n        !result && subscriber.complete();\n      })\n    );\n  });\n}\n", "import { MonoTypeOperatorFunction, Observer } from '../types';\nimport { isFunction } from '../util/isFunction';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { identity } from '../util/identity';\n\n/**\n * An extension to the {@link Observer} interface used only by the {@link tap} operator.\n *\n * It provides a useful set of callbacks a user can register to do side-effects in\n * cases other than what the usual {@link Observer} callbacks are\n * ({@link guide/glossary-and-semantics#next next},\n * {@link guide/glossary-and-semantics#error error} and/or\n * {@link guide/glossary-and-semantics#complete complete}).\n *\n * ## Example\n *\n * ```ts\n * import { fromEvent, switchMap, tap, interval, take } from 'rxjs';\n *\n * const source$ = fromEvent(document, 'click');\n * const result$ = source$.pipe(\n *   switchMap((_, i) => i % 2 === 0\n *     ? 
fromEvent(document, 'mousemove').pipe(\n *         tap({\n *           subscribe: () => console.log('Subscribed to the mouse move events after click #' + i),\n *           unsubscribe: () => console.log('Mouse move events #' + i + ' unsubscribed'),\n *           finalize: () => console.log('Mouse move events #' + i + ' finalized')\n *         })\n *       )\n *     : interval(1_000).pipe(\n *         take(5),\n *         tap({\n *           subscribe: () => console.log('Subscribed to the 1-second interval events after click #' + i),\n *           unsubscribe: () => console.log('1-second interval events #' + i + ' unsubscribed'),\n *           finalize: () => console.log('1-second interval events #' + i + ' finalized')\n *         })\n *       )\n *   )\n * );\n *\n * const subscription = result$.subscribe({\n *   next: console.log\n * });\n *\n * setTimeout(() => {\n *   console.log('Unsubscribe after 60 seconds');\n *   subscription.unsubscribe();\n * }, 60_000);\n * ```\n */\nexport interface TapObserver<T> extends Observer<T> {\n  /**\n   * The callback that `tap` operator invokes at the moment when the source Observable\n   * gets subscribed to.\n   */\n  subscribe: () => void;\n  /**\n   * The callback that `tap` operator invokes when an explicit\n   * {@link guide/glossary-and-semantics#unsubscription unsubscribe} happens. It won't get invoked on\n   * `error` or `complete` events.\n   */\n  unsubscribe: () => void;\n  /**\n   * The callback that `tap` operator invokes when any kind of\n   * {@link guide/glossary-and-semantics#finalization finalization} happens - either when\n   * the source Observable `error`s or `complete`s or when it gets explicitly unsubscribed\n   * by the user. There is no difference in using this callback or the {@link finalize}\n   * operator, but if you're already using `tap` operator, you can use this callback\n   * instead. 
You'd get the same result in either case.\n   */\n  finalize: () => void;\n}\nexport function tap<T>(observerOrNext?: Partial<TapObserver<T>> | ((value: T) => void)): MonoTypeOperatorFunction<T>;\n/** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\nexport function tap<T>(\n  next?: ((value: T) => void) | null,\n  error?: ((error: any) => void) | null,\n  complete?: (() => void) | null\n): MonoTypeOperatorFunction<T>;\n\n/**\n * Used to perform side-effects for notifications from the source observable\n *\n * <span class=\"informal\">Used when you want to affect outside state with a notification without altering the notification</span>\n *\n * ![](tap.png)\n *\n * Tap is designed to allow the developer a designated place to perform side effects. While you _could_ perform side-effects\n * inside of a `map` or a `mergeMap`, that would make their mapping functions impure, which isn't always a big deal, but will\n * make it so you can't do things like memoize those functions. The `tap` operator is designed solely for such side-effects to\n * help you remove side-effects from other operations.\n *\n * For any notification, next, error, or complete, `tap` will call the appropriate callback you have provided to it, via a function\n * reference, or a partial observer, then pass that notification down the stream.\n *\n * The observable returned by `tap` is an exact mirror of the source, with one exception: Any error that occurs -- synchronously -- in a handler\n * provided to `tap` will be emitted as an error from the returned observable.\n *\n * > Be careful! You can mutate objects as they pass through the `tap` operator's handlers.\n *\n * The most common use of `tap` is actually for debugging. 
You can place a `tap(console.log)` anywhere\n * in your observable `pipe`, log out the notifications as they are emitted by the source returned by the previous\n * operation.\n *\n * ## Examples\n *\n * Check a random number before it is handled. Below is an observable that will use a random number between 0 and 1,\n * and emit `'big'` or `'small'` depending on the size of that number. But we wanted to log what the original number\n * was, so we have added a `tap(console.log)`.\n *\n * ```ts\n * import { of, tap, map } from 'rxjs';\n *\n * of(Math.random()).pipe(\n *   tap(console.log),\n *   map(n => n > 0.5 ? 'big' : 'small')\n * ).subscribe(console.log);\n * ```\n *\n * Using `tap` to analyze a value and force an error. Below is an observable where in our system we only\n * want to emit numbers 3 or less we get from another source. We can force our observable to error\n * using `tap`.\n *\n * ```ts\n * import { of, tap } from 'rxjs';\n *\n * const source = of(1, 2, 3, 4, 5);\n *\n * source.pipe(\n *   tap(n => {\n *     if (n > 3) {\n *       throw new TypeError(`Value ${ n } is greater than 3`);\n *     }\n *   })\n * )\n * .subscribe({ next: console.log, error: err => console.log(err.message) });\n * ```\n *\n * We want to know when an observable completes before moving on to the next observable. The system\n * below will emit a random series of `'X'` characters from 3 different observables in sequence. 
The\n * only way we know when one observable completes and moves to the next one, in this case, is because\n * we have added a `tap` with the side effect of logging to console.\n *\n * ```ts\n * import { of, concatMap, interval, take, map, tap } from 'rxjs';\n *\n * of(1, 2, 3).pipe(\n *   concatMap(n => interval(1000).pipe(\n *     take(Math.round(Math.random() * 10)),\n *     map(() => 'X'),\n *     tap({ complete: () => console.log(`Done with ${ n }`) })\n *   ))\n * )\n * .subscribe(console.log);\n * ```\n *\n * @see {@link finalize}\n * @see {@link TapObserver}\n *\n * @param observerOrNext A next handler or partial observer\n * @param error An error handler\n * @param complete A completion handler\n * @return A function that returns an Observable identical to the source, but\n * runs the specified Observer or callback(s) for each item.\n */\nexport function tap<T>(\n  observerOrNext?: Partial<TapObserver<T>> | ((value: T) => void) | null,\n  error?: ((e: any) => void) | null,\n  complete?: (() => void) | null\n): MonoTypeOperatorFunction<T> {\n  // We have to check to see not only if next is a function,\n  // but if error or complete were passed. This is because someone\n  // could technically call tap like `tap(null, fn)` or `tap(null, null, fn)`.\n  const tapObserver =\n    isFunction(observerOrNext) || error || complete\n      ? // tslint:disable-next-line: no-object-literal-type-assertion\n        ({ next: observerOrNext as Exclude<typeof observerOrNext, Partial<TapObserver<T>>>, error, complete } as Partial<TapObserver<T>>)\n      : observerOrNext;\n\n  return tapObserver\n    ? 
operate((source, subscriber) => {\n        tapObserver.subscribe?.();\n        let isUnsub = true;\n        source.subscribe(\n          createOperatorSubscriber(\n            subscriber,\n            (value) => {\n              tapObserver.next?.(value);\n              subscriber.next(value);\n            },\n            () => {\n              isUnsub = false;\n              tapObserver.complete?.();\n              subscriber.complete();\n            },\n            (err) => {\n              isUnsub = false;\n              tapObserver.error?.(err);\n              subscriber.error(err);\n            },\n            () => {\n              if (isUnsub) {\n                tapObserver.unsubscribe?.();\n              }\n              tapObserver.finalize?.();\n            }\n          )\n        );\n      })\n    : // Tap was called with no valid tap observer or handler\n      // (e.g. `tap(null, null, null)` or `tap(null)` or `tap()`)\n      // so we're going to just mirror the source.\n      identity;\n}\n", "import { Subscription } from '../Subscription';\n\nimport { MonoTypeOperatorFunction, ObservableInput } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { innerFrom } from '../observable/innerFrom';\n\n/**\n * An object interface used by {@link throttle} or {@link throttleTime} that ensure\n * configuration options of these operators.\n *\n * @see {@link throttle}\n * @see {@link throttleTime}\n */\nexport interface ThrottleConfig {\n  /**\n   * If `true`, the resulting Observable will emit the first value from the source\n   * Observable at the **start** of the \"throttling\" process (when starting an\n   * internal timer that prevents other emissions from the source to pass through).\n   * If `false`, it will not emit the first value from the source Observable at the\n   * start of the \"throttling\" process.\n   *\n   * If not provided, defaults to: `true`.\n   */\n  leading?: 
boolean;\n  /**\n   * If `true`, the resulting Observable will emit the last value from the source\n   * Observable at the **end** of the \"throttling\" process (when ending an internal\n   * timer that prevents other emissions from the source to pass through).\n   * If `false`, it will not emit the last value from the source Observable at the\n   * end of the \"throttling\" process.\n   *\n   * If not provided, defaults to: `false`.\n   */\n  trailing?: boolean;\n}\n\n/**\n * Emits a value from the source Observable, then ignores subsequent source\n * values for a duration determined by another Observable, then repeats this\n * process.\n *\n * <span class=\"informal\">It's like {@link throttleTime}, but the silencing\n * duration is determined by a second Observable.</span>\n *\n * ![](throttle.svg)\n *\n * `throttle` emits the source Observable values on the output Observable\n * when its internal timer is disabled, and ignores source values when the timer\n * is enabled. Initially, the timer is disabled. As soon as the first source\n * value arrives, it is forwarded to the output Observable, and then the timer\n * is enabled by calling the `durationSelector` function with the source value,\n * which returns the \"duration\" Observable. 
When the duration Observable emits a\n * value, the timer is disabled, and this process repeats for the\n * next source value.\n *\n * ## Example\n *\n * Emit clicks at a rate of at most one click per second\n *\n * ```ts\n * import { fromEvent, throttle, interval } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(throttle(() => interval(1000)));\n *\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link audit}\n * @see {@link debounce}\n * @see {@link delayWhen}\n * @see {@link sample}\n * @see {@link throttleTime}\n *\n * @param durationSelector A function that receives a value from the source\n * Observable, for computing the silencing duration for each source value,\n * returned as an `ObservableInput`.\n * @param config A configuration object to define `leading` and `trailing`\n * behavior. Defaults to `{ leading: true, trailing: false }`.\n * @return A function that returns an Observable that performs the throttle\n * operation to limit the rate of emissions from the source.\n */\nexport function throttle<T>(durationSelector: (value: T) => ObservableInput<any>, config?: ThrottleConfig): MonoTypeOperatorFunction<T> {\n  return operate((source, subscriber) => {\n    const { leading = true, trailing = false } = config ?? 
{};\n    let hasValue = false;\n    let sendValue: T | null = null;\n    let throttled: Subscription | null = null;\n    let isComplete = false;\n\n    const endThrottling = () => {\n      throttled?.unsubscribe();\n      throttled = null;\n      if (trailing) {\n        send();\n        isComplete && subscriber.complete();\n      }\n    };\n\n    const cleanupThrottling = () => {\n      throttled = null;\n      isComplete && subscriber.complete();\n    };\n\n    const startThrottle = (value: T) =>\n      (throttled = innerFrom(durationSelector(value)).subscribe(createOperatorSubscriber(subscriber, endThrottling, cleanupThrottling)));\n\n    const send = () => {\n      if (hasValue) {\n        // Ensure we clear out our value and hasValue flag\n        // before we emit, otherwise reentrant code can cause\n        // issues here.\n        hasValue = false;\n        const value = sendValue!;\n        sendValue = null;\n        // Emit the value.\n        subscriber.next(value);\n        !isComplete && startThrottle(value);\n      }\n    };\n\n    source.subscribe(\n      createOperatorSubscriber(\n        subscriber,\n        // Regarding the presence of throttled.closed in the following\n        // conditions, if a synchronous duration selector is specified - weird,\n        // but legal - an already-closed subscription will be assigned to\n        // throttled, so the subscription's closed property needs to be checked,\n        // too.\n        (value) => {\n          hasValue = true;\n          sendValue = value;\n          !(throttled && !throttled.closed) && (leading ? 
send() : startThrottle(value));\n        },\n        () => {\n          isComplete = true;\n          !(trailing && hasValue && throttled && !throttled.closed) && subscriber.complete();\n        }\n      )\n    );\n  });\n}\n", "import { asyncScheduler } from '../scheduler/async';\nimport { throttle, ThrottleConfig } from './throttle';\nimport { MonoTypeOperatorFunction, SchedulerLike } from '../types';\nimport { timer } from '../observable/timer';\n\n/**\n * Emits a value from the source Observable, then ignores subsequent source\n * values for `duration` milliseconds, then repeats this process.\n *\n * <span class=\"informal\">Lets a value pass, then ignores source values for the\n * next `duration` milliseconds.</span>\n *\n * ![](throttleTime.png)\n *\n * `throttleTime` emits the source Observable values on the output Observable\n * when its internal timer is disabled, and ignores source values when the timer\n * is enabled. Initially, the timer is disabled. As soon as the first source\n * value arrives, it is forwarded to the output Observable, and then the timer\n * is enabled. After `duration` milliseconds (or the time unit determined\n * internally by the optional `scheduler`) has passed, the timer is disabled,\n * and this process repeats for the next source value. 
Optionally takes a\n * {@link SchedulerLike} for managing timers.\n *\n * ## Examples\n *\n * ### Limit click rate\n *\n * Emit clicks at a rate of at most one click per second\n *\n * ```ts\n * import { fromEvent, throttleTime } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const result = clicks.pipe(throttleTime(1000));\n *\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link auditTime}\n * @see {@link debounceTime}\n * @see {@link delay}\n * @see {@link sampleTime}\n * @see {@link throttle}\n *\n * @param duration Time to wait before emitting another value after\n * emitting the last value, measured in milliseconds or the time unit determined\n * internally by the optional `scheduler`.\n * @param scheduler The {@link SchedulerLike} to use for\n * managing the timers that handle the throttling. Defaults to {@link asyncScheduler}.\n * @param config A configuration object to define `leading` and\n * `trailing` behavior. Defaults to `{ leading: true, trailing: false }`.\n * @return A function that returns an Observable that performs the throttle\n * operation to limit the rate of emissions from the source.\n */\nexport function throttleTime<T>(\n  duration: number,\n  scheduler: SchedulerLike = asyncScheduler,\n  config?: ThrottleConfig\n): MonoTypeOperatorFunction<T> {\n  const duration$ = timer(duration, scheduler);\n  return throttle(() => duration$, config);\n}\n", "import { OperatorFunction, ObservableInputTuple } from '../types';\nimport { operate } from '../util/lift';\nimport { createOperatorSubscriber } from './OperatorSubscriber';\nimport { innerFrom } from '../observable/innerFrom';\nimport { identity } from '../util/identity';\nimport { noop } from '../util/noop';\nimport { popResultSelector } from '../util/args';\n\nexport function withLatestFrom<T, O extends unknown[]>(...inputs: [...ObservableInputTuple<O>]): OperatorFunction<T, [T, ...O]>;\n\nexport function withLatestFrom<T, O extends unknown[], R>(\n  
...inputs: [...ObservableInputTuple<O>, (...value: [T, ...O]) => R]\n): OperatorFunction<T, R>;\n\n/**\n * Combines the source Observable with other Observables to create an Observable\n * whose values are calculated from the latest values of each, only when the\n * source emits.\n *\n * <span class=\"informal\">Whenever the source Observable emits a value, it\n * computes a formula using that value plus the latest values from other input\n * Observables, then emits the output of that formula.</span>\n *\n * ![](withLatestFrom.png)\n *\n * `withLatestFrom` combines each value from the source Observable (the\n * instance) with the latest values from the other input Observables only when\n * the source emits a value, optionally using a `project` function to determine\n * the value to be emitted on the output Observable. All input Observables must\n * emit at least one value before the output Observable will emit a value.\n *\n * ## Example\n *\n * On every click event, emit an array with the latest timer event plus the click event\n *\n * ```ts\n * import { fromEvent, interval, withLatestFrom } from 'rxjs';\n *\n * const clicks = fromEvent(document, 'click');\n * const timer = interval(1000);\n * const result = clicks.pipe(withLatestFrom(timer));\n * result.subscribe(x => console.log(x));\n * ```\n *\n * @see {@link combineLatest}\n *\n * @param {ObservableInput} other An input Observable to combine with the source\n * Observable. More than one input Observables may be given as argument.\n * @param {Function} [project] Projection function for combining values\n * together. Receives all values in order of the Observables passed, where the\n * first parameter is a value from the source Observable. (e.g.\n * `a.pipe(withLatestFrom(b, c), map(([a1, b1, c1]) => a1 + b1 + c1))`). 
If this is not\n * passed, arrays will be emitted on the output Observable.\n * @return A function that returns an Observable of projected values from the\n * most recent values from each input Observable, or an array of the most\n * recent values from each input Observable.\n */\nexport function withLatestFrom<T, R>(...inputs: any[]): OperatorFunction<T, R | any[]> {\n  const project = popResultSelector(inputs) as ((...args: any[]) => R) | undefined;\n\n  return operate((source, subscriber) => {\n    const len = inputs.length;\n    const otherValues = new Array(len);\n    // An array of whether or not the other sources have emitted. Matched with them by index.\n    // TODO: At somepoint, we should investigate the performance implications here, and look\n    // into using a `Set()` and checking the `size` to see if we're ready.\n    let hasValue = inputs.map(() => false);\n    // Flipped true when we have at least one value from all other sources and\n    // we are ready to start emitting values.\n    let ready = false;\n\n    // Other sources. Note that here we are not checking `subscriber.closed`,\n    // this causes all inputs to be subscribed to, even if nothing can be emitted\n    // from them. 
This is an important distinction because subscription constitutes\n    // a side-effect.\n    for (let i = 0; i < len; i++) {\n      innerFrom(inputs[i]).subscribe(\n        createOperatorSubscriber(\n          subscriber,\n          (value) => {\n            otherValues[i] = value;\n            if (!ready && !hasValue[i]) {\n              // If we're not ready yet, flag to show this observable has emitted.\n              hasValue[i] = true;\n              // Intentionally terse code.\n              // If all of our other observables have emitted, set `ready` to `true`,\n              // so we know we can start emitting values, then clean up the `hasValue` array,\n              // because we don't need it anymore.\n              (ready = hasValue.every(identity)) && (hasValue = null!);\n            }\n          },\n          // Completing one of the other sources has\n          // no bearing on the completion of our result.\n          noop\n        )\n      );\n    }\n\n    // Source subscription\n    source.subscribe(\n      createOperatorSubscriber(subscriber, (value) => {\n        if (ready) {\n          // We have at least one value from the other sources. Go ahead and emit.\n          const values = [value, ...otherValues];\n          subscriber.next(project ? project(...values) : values);\n        }\n      })\n    );\n  });\n}\n", "import { zip as zipStatic } from '../observable/zip';\nimport { ObservableInput, ObservableInputTuple, OperatorFunction, Cons } from '../types';\nimport { operate } from '../util/lift';\n\n/** @deprecated Replaced with {@link zipWith}. Will be removed in v8. */\nexport function zip<T, A extends readonly unknown[]>(otherInputs: [...ObservableInputTuple<A>]): OperatorFunction<T, Cons<T, A>>;\n/** @deprecated Replaced with {@link zipWith}. Will be removed in v8. 
*/\nexport function zip<T, A extends readonly unknown[], R>(\n  otherInputsAndProject: [...ObservableInputTuple<A>],\n  project: (...values: Cons<T, A>) => R\n): OperatorFunction<T, R>;\n/** @deprecated Replaced with {@link zipWith}. Will be removed in v8. */\nexport function zip<T, A extends readonly unknown[]>(...otherInputs: [...ObservableInputTuple<A>]): OperatorFunction<T, Cons<T, A>>;\n/** @deprecated Replaced with {@link zipWith}. Will be removed in v8. */\nexport function zip<T, A extends readonly unknown[], R>(\n  ...otherInputsAndProject: [...ObservableInputTuple<A>, (...values: Cons<T, A>) => R]\n): OperatorFunction<T, R>;\n\n/**\n * @deprecated Replaced with {@link zipWith}. Will be removed in v8.\n */\nexport function zip<T, R>(...sources: Array<ObservableInput<any> | ((...values: Array<any>) => R)>): OperatorFunction<T, any> {\n  return operate((source, subscriber) => {\n    zipStatic(source as ObservableInput<any>, ...(sources as Array<ObservableInput<any>>)).subscribe(subscriber);\n  });\n}\n", "import { ObservableInputTuple, OperatorFunction, Cons } from '../types';\nimport { zip } from './zip';\n\n/**\n * Subscribes to the source, and the observable inputs provided as arguments, and combines their values, by index, into arrays.\n *\n * What is meant by \"combine by index\": The first value from each will be made into a single array, then emitted,\n * then the second value from each will be combined into a single array and emitted, then the third value\n * from each will be combined into a single array and emitted, and so on.\n *\n * This will continue until it is no longer able to combine values of the same index into an array.\n *\n * After the last value from any one completed source is emitted in an array, the resulting observable will complete,\n * as there is no way to continue \"zipping\" values together by index.\n *\n * Use-cases for this operator are limited. 
There are memory concerns if one of the streams is emitting\n * values at a much faster rate than the others. Usage should likely be limited to streams that emit\n * at a similar pace, or finite streams of known length.\n *\n * In many cases, authors want `combineLatestWith` and not `zipWith`.\n *\n * @param otherInputs other observable inputs to collate values from.\n * @return A function that returns an Observable that emits items by index\n * combined from the source Observable and provided Observables, in form of an\n * array.\n */\nexport function zipWith<T, A extends readonly unknown[]>(...otherInputs: [...ObservableInputTuple<A>]): OperatorFunction<T, Cons<T, A>> {\n  return zip(...otherInputs);\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  ReplaySubject,\n  Subject,\n  fromEvent\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch document\n *\n * Documents are implemented as subjects, so all downstream observables are\n * automatically updated when a new document is emitted.\n *\n * @returns Document subject\n */\nexport function watchDocument(): Subject<Document> {\n  const document$ = new ReplaySubject<Document>(1)\n  fromEvent(document, \"DOMContentLoaded\", { once: true })\n    .subscribe(() => document$.next(document))\n\n  /* Return document */\n  return document$\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve all elements matching the query selector\n *\n * @template T - Element type\n *\n * @param selector - Query selector\n * @param node - Node of reference\n *\n * @returns Elements\n */\nexport function getElements<T extends keyof HTMLElementTagNameMap>(\n  selector: T, node?: ParentNode\n): HTMLElementTagNameMap[T][]\n\nexport function getElements<T extends HTMLElement>(\n  selector: string, node?: ParentNode\n): T[]\n\nexport function getElements<T extends HTMLElement>(\n  selector: string, node: ParentNode = document\n): T[] {\n  return Array.from(node.querySelectorAll<T>(selector))\n}\n\n/**\n * Retrieve an element matching a query selector or throw a reference error\n *\n * Note that this function assumes that the element is present. 
If unsure if an\n * element is existent, use the `getOptionalElement` function instead.\n *\n * @template T - Element type\n *\n * @param selector - Query selector\n * @param node - Node of reference\n *\n * @returns Element\n */\nexport function getElement<T extends keyof HTMLElementTagNameMap>(\n  selector: T, node?: ParentNode\n): HTMLElementTagNameMap[T]\n\nexport function getElement<T extends HTMLElement>(\n  selector: string, node?: ParentNode\n): T\n\nexport function getElement<T extends HTMLElement>(\n  selector: string, node: ParentNode = document\n): T {\n  const el = getOptionalElement<T>(selector, node)\n  if (typeof el === \"undefined\")\n    throw new ReferenceError(\n      `Missing element: expected \"${selector}\" to be present`\n    )\n\n  /* Return element */\n  return el\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Retrieve an optional element matching the query selector\n *\n * @template T - Element type\n *\n * @param selector - Query selector\n * @param node - Node of reference\n *\n * @returns Element or nothing\n */\nexport function getOptionalElement<T extends keyof HTMLElementTagNameMap>(\n  selector: T, node?: ParentNode\n): HTMLElementTagNameMap[T] | undefined\n\nexport function getOptionalElement<T extends HTMLElement>(\n  selector: string, node?: ParentNode\n): T | undefined\n\nexport function getOptionalElement<T extends HTMLElement>(\n  selector: string, node: ParentNode = document\n): T | undefined {\n  return node.querySelector<T>(selector) || undefined\n}\n\n/**\n * Retrieve the currently active element\n *\n * @returns Element or nothing\n */\nexport function getActiveElement(): HTMLElement | undefined {\n  return (\n    document.activeElement?.shadowRoot?.activeElement as HTMLElement ??\n    document.activeElement as HTMLElement ??\n    undefined\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of 
charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  debounceTime,\n  distinctUntilChanged,\n  fromEvent,\n  map,\n  merge,\n  shareReplay,\n  startWith\n} from \"rxjs\"\n\nimport { getActiveElement } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Focus observable\n *\n * Previously, this observer used `focus` and `blur` events to determine whether\n * an element is focused, but this doesn't work if there are focusable elements\n * within the elements itself. 
A better solutions are `focusin` and `focusout`\n * events, which bubble up the tree and allow for more fine-grained control.\n *\n * `debounceTime` is necessary, because when a focus change happens inside an\n * element, the observable would first emit `false` and then `true` again.\n */\nconst observer$ = merge(\n  fromEvent(document.body, \"focusin\"),\n  fromEvent(document.body, \"focusout\")\n)\n  .pipe(\n    debounceTime(1),\n    startWith(undefined),\n    map(() => getActiveElement() || document.body),\n    shareReplay(1)\n  )\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch element focus\n *\n * @param el - Element\n *\n * @returns Element focus observable\n */\nexport function watchElementFocus(\n  el: HTMLElement\n): Observable<boolean> {\n  return observer$\n    .pipe(\n      map(active => el.contains(active)),\n      distinctUntilChanged()\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  debounce,\n  defer,\n  fromEvent,\n  identity,\n  map,\n  merge,\n  startWith,\n  timer\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch element hover\n *\n * The second parameter allows to specify a timeout in milliseconds after which\n * the hover state will be reset to `false`. This is useful for tooltips which\n * should disappear after a certain amount of time, in order to allow the user\n * to move the cursor from the host to the tooltip.\n *\n * @param el - Element\n * @param timeout - Timeout\n *\n * @returns Element hover observable\n */\nexport function watchElementHover(\n  el: HTMLElement, timeout?: number\n): Observable<boolean> {\n  return defer(() => merge(\n    fromEvent(el, \"mouseenter\").pipe(map(() => true)),\n    fromEvent(el, \"mouseleave\").pipe(map(() => false))\n  )\n    .pipe(\n      timeout ? 
debounce(active => timer(+!active * timeout)) : identity,\n      startWith(el.matches(\":hover\"))\n    )\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { JSX as JSXInternal } from \"preact\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * HTML attributes\n */\ntype Attributes =\n  & JSXInternal.HTMLAttributes\n  & JSXInternal.SVGAttributes\n  & Record<string, any>\n\n/**\n * Child element\n */\ntype Child =\n  | ChildNode\n  | HTMLElement\n  | Text\n  | string\n  | number\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Append a child node to an element\n *\n * @param el - Element\n * @param child - Child node(s)\n */\nfunction appendChild(el: HTMLElement, child: Child | Child[]): void {\n\n  /* Handle primitive types (including raw HTML) */\n  if (typeof child === \"string\" || typeof child === \"number\") {\n    el.innerHTML += child.toString()\n\n  /* Handle nodes */\n  } else if (child instanceof Node) {\n    el.appendChild(child)\n\n  /* Handle nested children */\n  } else if (Array.isArray(child)) {\n    for (const node of child)\n      appendChild(el, node)\n  }\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * JSX factory\n *\n * @template T - Element type\n *\n * @param tag - HTML tag\n * @param attributes - HTML attributes\n * @param children - Child elements\n *\n * @returns Element\n */\nexport function h<T extends keyof HTMLElementTagNameMap>(\n  tag: T, attributes?: Attributes | null, 
...children: Child[]\n): HTMLElementTagNameMap[T]\n\nexport function h<T extends h.JSX.Element>(\n  tag: string, attributes?: Attributes | null, ...children: Child[]\n): T\n\nexport function h<T extends h.JSX.Element>(\n  tag: string, attributes?: Attributes | null, ...children: Child[]\n): T {\n  const el = document.createElement(tag)\n\n  /* Set attributes, if any */\n  if (attributes)\n    for (const attr of Object.keys(attributes)) {\n      if (typeof attributes[attr] === \"undefined\")\n        continue\n\n      /* Set default attribute or boolean */\n      if (typeof attributes[attr] !== \"boolean\")\n        el.setAttribute(attr, attributes[attr])\n      else\n        el.setAttribute(attr, \"\")\n    }\n\n  /* Append child nodes */\n  for (const child of children)\n    appendChild(el, child)\n\n  /* Return element */\n  return el as T\n}\n\n/* ----------------------------------------------------------------------------\n * Namespace\n * ------------------------------------------------------------------------- */\n\nexport declare namespace h {\n  namespace JSX {\n    type Element = HTMLElement\n    type IntrinsicElements = JSXInternal.IntrinsicElements\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Round a number for display with repository facts\n *\n * This is a reverse-engineered version of GitHub's weird rounding algorithm\n * for stars, forks and all other numbers. While all numbers below `1,000` are\n * returned as-is, bigger numbers are converted to fixed numbers:\n *\n * - `1,049` => `1k`\n * - `1,050` => `1.1k`\n * - `1,949` => `1.9k`\n * - `1,950` => `2k`\n *\n * @param value - Original value\n *\n * @returns Rounded value\n */\nexport function round(value: number): string {\n  if (value > 999) {\n    const digits = +((value - 950) % 1000 > 99)\n    return `${((value + 0.000001) / 1000).toFixed(digits)}k`\n  } else {\n    return value.toString()\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, 
EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  defer,\n  finalize,\n  fromEvent,\n  map,\n  merge,\n  switchMap,\n  take,\n  throwError\n} from \"rxjs\"\n\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Create and load a `script` element\n *\n * This function returns an observable that will emit when the script was\n * successfully loaded, or throw an error if it wasn't.\n *\n * @param src - Script URL\n *\n * @returns Script observable\n */\nexport function watchScript(src: string): Observable<void> {\n  const script = h(\"script\", { src })\n  return defer(() => {\n    document.head.appendChild(script)\n    return merge(\n      fromEvent(script, \"load\"),\n      fromEvent(script, \"error\")\n        .pipe(\n          switchMap(() => (\n            throwError(() => new ReferenceError(`Invalid script: ${src}`))\n          ))\n        )\n    )\n      .pipe(\n        map(() => undefined),\n        finalize(() => document.head.removeChild(script)),\n        take(1)\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell 
copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  NEVER,\n  Observable,\n  Subject,\n  defer,\n  filter,\n  finalize,\n  map,\n  merge,\n  of,\n  shareReplay,\n  startWith,\n  switchMap,\n  tap\n} from \"rxjs\"\n\nimport { watchScript } from \"../../../script\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Element offset\n */\nexport interface ElementSize {\n  width: number                        /* Element width */\n  height: number                       /* Element height */\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Resize observer entry subject\n */\nconst entry$ = new Subject<ResizeObserverEntry>()\n\n/**\n * Resize observer observable\n *\n * This observable will create a `ResizeObserver` on the first subscription\n * and will automatically terminate it when there are no more subscribers.\n * It's quite important to centralize observation in a single `ResizeObserver`,\n * as the performance difference can be quite dramatic, as the link shows.\n *\n * If the 
browser doesn't have a `ResizeObserver` implementation available, a\n * polyfill is automatically downloaded from unpkg.com. This is also compatible\n * with the built-in privacy plugin, which will download the polyfill and put\n * it alongside the built site for self-hosting.\n *\n * @see https://bit.ly/3iIYfEm - Google Groups on performance\n */\nconst observer$ = defer(() => (\n  typeof ResizeObserver === \"undefined\"\n    ? watchScript(\"https://unpkg.com/resize-observer-polyfill\")\n    : of(undefined)\n))\n  .pipe(\n    map(() => new ResizeObserver(entries => (\n      entries.forEach(entry => entry$.next(entry))\n    ))),\n    switchMap(observer => merge(NEVER, of(observer)).pipe(\n      finalize(() => observer.disconnect())\n    )),\n    shareReplay(1)\n  )\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve element size\n *\n * @param el - Element\n *\n * @returns Element size\n */\nexport function getElementSize(\n  el: HTMLElement\n): ElementSize {\n  return {\n    width:  el.offsetWidth,\n    height: el.offsetHeight\n  }\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch element size\n *\n * This function returns an observable that subscribes to a single internal\n * instance of `ResizeObserver` upon subscription, and emit resize events until\n * termination. Note that this function should not be called with the same\n * element twice, as the first unsubscription will terminate observation.\n *\n * Sadly, we can't use the `DOMRect` objects returned by the observer, because\n * we need the emitted values to be consistent with `getElementSize`, which will\n * return the used values (rounded) and not actual values (unrounded). Thus, we\n * use the `offset*` properties. 
See the linked GitHub issue.\n *\n * @see https://bit.ly/3m0k3he - GitHub issue\n *\n * @param el - Element\n *\n * @returns Element size observable\n */\nexport function watchElementSize(\n  el: HTMLElement\n): Observable<ElementSize> {\n\n  // Compute target element - since inline elements cannot be observed by the\n  // current `ResizeObserver` implementation as provided by browsers, we need\n  // to determine the first containing parent element and use that one as a\n  // target, while we always compute the actual size from the element.\n  let target = el\n  while (target.clientWidth === 0)\n    if (target.parentElement)\n      target = target.parentElement\n    else\n      break\n\n  // Observe target element and recompute element size on resize - as described\n  // above, the target element is not necessarily the element of interest\n  return observer$.pipe(\n    tap(observer => observer.observe(target)),\n    switchMap(observer => entry$.pipe(\n      filter(entry => entry.target === target),\n      finalize(() => observer.unobserve(target))\n    )),\n    map(() => getElementSize(el)),\n    startWith(getElementSize(el))\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * 
FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { ElementSize } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve element content size (= scroll width and height)\n *\n * @param el - Element\n *\n * @returns Element content size\n */\nexport function getElementContentSize(\n  el: HTMLElement\n): ElementSize {\n  return {\n    width:  el.scrollWidth,\n    height: el.scrollHeight\n  }\n}\n\n/**\n * Retrieve the overflowing container of an element, if any\n *\n * @param el - Element\n *\n * @returns Overflowing container or nothing\n */\nexport function getElementContainer(\n  el: HTMLElement\n): HTMLElement | undefined {\n  let parent = el.parentElement\n  while (parent)\n    if (\n      el.scrollWidth  <= parent.scrollWidth &&\n      el.scrollHeight <= parent.scrollHeight\n    )\n      parent = (el = parent).parentElement\n    else\n      break\n\n  /* Return overflowing container */\n  return parent ? 
el : undefined\n}\n\n/**\n * Retrieve all overflowing containers of an element, if any\n *\n * Note that this function has a slightly different behavior, so we should at\n * some point consider refactoring how overflowing containers are handled.\n *\n * @param el - Element\n *\n * @returns Overflowing containers\n */\nexport function getElementContainers(\n  el: HTMLElement\n): HTMLElement[] {\n  const containers: HTMLElement[] = []\n\n  // Walk up the DOM tree until we find an overflowing container\n  let parent = el.parentElement\n  while (parent) {\n    if (\n      el.clientWidth  > parent.clientWidth ||\n      el.clientHeight > parent.clientHeight\n    )\n      containers.push(parent)\n\n    // Continue with parent element\n    parent = (el = parent).parentElement\n  }\n\n  // If the page is short, the body might not be overflowing and there might be\n  // no other containers, which is why we need to make sure the body is present\n  if (containers.length === 0)\n    containers.push(document.documentElement)\n\n  // Return overflowing containers\n  return containers\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  animationFrameScheduler,\n  auditTime,\n  fromEvent,\n  map,\n  merge,\n  startWith\n} from \"rxjs\"\n\nimport { watchElementSize } from \"../../size\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Element offset\n */\nexport interface ElementOffset {\n  x: number                            /* Horizontal offset */\n  y: number                            /* Vertical offset */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve element offset\n *\n * @param el - Element\n *\n * @returns Element offset\n */\nexport function getElementOffset(\n  el: HTMLElement\n): ElementOffset {\n  return {\n    x: el.offsetLeft,\n    y: el.offsetTop\n  }\n}\n\n/**\n * Retrieve absolute element offset\n *\n * @param el - Element\n *\n * @returns Element offset\n */\nexport function getElementOffsetAbsolute(\n  el: HTMLElement\n): ElementOffset {\n  const rect = el.getBoundingClientRect()\n  return {\n    x: rect.x + window.scrollX,\n    y: rect.y + window.scrollY\n  }\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch element offset\n *\n * @param el - Element\n *\n * @returns Element offset observable\n */\nexport function watchElementOffset(\n  el: HTMLElement\n): Observable<ElementOffset> {\n  return merge(\n    fromEvent(window, \"load\"),\n    fromEvent(window, \"resize\")\n  )\n    .pipe(\n      auditTime(0, animationFrameScheduler),\n      
map(() => getElementOffset(el)),\n      startWith(getElementOffset(el))\n    )\n}\n\n/**\n * Watch absolute element offset\n *\n * @param el - Element\n *\n * @returns Element offset observable\n */\nexport function watchElementOffsetAbsolute(\n  el: HTMLElement\n): Observable<ElementOffset> {\n  return merge(\n    watchElementOffset(el),\n    watchElementSize(document.body) // @todo find a better way for this\n  )\n    .pipe(\n      map(() => getElementOffsetAbsolute(el)),\n      startWith(getElementOffsetAbsolute(el))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  animationFrameScheduler,\n  auditTime,\n  fromEvent,\n  map,\n  merge,\n  startWith\n} from \"rxjs\"\n\nimport { ElementOffset } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve element content offset (= scroll offset)\n *\n * @param el - Element\n *\n * @returns Element content offset\n */\nexport function getElementContentOffset(\n  el: HTMLElement\n): ElementOffset {\n  return {\n    x: el.scrollLeft,\n    y: el.scrollTop\n  }\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch element content offset\n *\n * @param el - Element\n *\n * @returns Element content offset observable\n */\nexport function watchElementContentOffset(\n  el: HTMLElement\n): Observable<ElementOffset> {\n  return merge(\n    fromEvent(el, \"scroll\"),\n    fromEvent(window, \"scroll\"),\n    fromEvent(window, \"resize\")\n  )\n    .pipe(\n      auditTime(0, animationFrameScheduler),\n      map(() => getElementContentOffset(el)),\n      startWith(getElementContentOffset(el))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do 
so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  NEVER,\n  Observable,\n  Subject,\n  defer,\n  distinctUntilChanged,\n  filter,\n  finalize,\n  map,\n  merge,\n  of,\n  shareReplay,\n  switchMap,\n  tap\n} from \"rxjs\"\n\nimport {\n  getElementContentSize,\n  getElementSize,\n  watchElementContentOffset\n} from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Intersection observer entry subject\n */\nconst entry$ = new Subject<IntersectionObserverEntry>()\n\n/**\n * Intersection observer observable\n *\n * This observable will create an `IntersectionObserver` on first subscription\n * and will automatically terminate it when there are no more subscribers.\n *\n * @see https://bit.ly/3iIYfEm - Google Groups on performance\n */\nconst observer$ = defer(() => of(\n  new IntersectionObserver(entries => {\n    for (const entry of entries)\n      entry$.next(entry)\n  }, {\n    threshold: 0\n  })\n))\n  .pipe(\n    switchMap(observer => merge(NEVER, of(observer))\n      .pipe(\n        finalize(() => observer.disconnect())\n      )\n    ),\n    shareReplay(1)\n  )\n\n/* ----------------------------------------------------------------------------\n * Functions\n * 
------------------------------------------------------------------------- */\n\n/**\n * Watch element visibility\n *\n * @param el - Element\n *\n * @returns Element visibility observable\n */\nexport function watchElementVisibility(\n  el: HTMLElement\n): Observable<boolean> {\n  return observer$\n    .pipe(\n      tap(observer => observer.observe(el)),\n      switchMap(observer => entry$\n        .pipe(\n          filter(({ target }) => target === el),\n          finalize(() => observer.unobserve(el)),\n          map(({ isIntersecting }) => isIntersecting)\n        )\n      )\n    )\n}\n\n/**\n * Watch element boundary\n *\n * This function returns an observable which emits whether the bottom content\n * boundary (= scroll offset) of an element is within a certain threshold.\n *\n * @param el - Element\n * @param threshold - Threshold\n *\n * @returns Element boundary observable\n */\nexport function watchElementBoundary(\n  el: HTMLElement, threshold = 16\n): Observable<boolean> {\n  return watchElementContentOffset(el)\n    .pipe(\n      map(({ y }) => {\n        const visible = getElementSize(el)\n        const content = getElementContentSize(el)\n        return y >= (\n          content.height - visible.height - threshold\n        )\n      }),\n      distinctUntilChanged()\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * 
THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  fromEvent,\n  map,\n  startWith\n} from \"rxjs\"\n\nimport { getElement } from \"../element\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Toggle\n */\nexport type Toggle =\n  | \"drawer\"                           /* Toggle for drawer */\n  | \"search\"                           /* Toggle for search */\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Toggle map\n */\nconst toggles: Record<Toggle, HTMLInputElement> = {\n  drawer: getElement(\"[data-md-toggle=drawer]\"),\n  search: getElement(\"[data-md-toggle=search]\")\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve the value of a toggle\n *\n * @param name - Toggle\n *\n * @returns Toggle value\n */\nexport function getToggle(name: Toggle): boolean {\n  return toggles[name].checked\n}\n\n/**\n * Set toggle\n *\n * Simulating a click event seems to be the most cross-browser compatible way\n * of changing the value while also emitting a `change` event. 
Before, Material\n * used `CustomEvent` to programmatically change the value of a toggle, but this\n * is a much simpler and cleaner solution which doesn't require a polyfill.\n *\n * @param name - Toggle\n * @param value - Toggle value\n */\nexport function setToggle(name: Toggle, value: boolean): void {\n  if (toggles[name].checked !== value)\n    toggles[name].click()\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch toggle\n *\n * @param name - Toggle\n *\n * @returns Toggle value observable\n */\nexport function watchToggle(name: Toggle): Observable<boolean> {\n  const el = toggles[name]\n  return fromEvent(el, \"change\")\n    .pipe(\n      map(() => el.checked),\n      startWith(el.checked)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  filter,\n  fromEvent,\n  map,\n  merge,\n  share,\n  startWith,\n  switchMap\n} from \"rxjs\"\n\nimport { getActiveElement } from \"../element\"\nimport { getToggle } from \"../toggle\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Keyboard mode\n */\nexport type KeyboardMode =\n  | \"global\"                           /* Global */\n  | \"search\"                           /* Search is open */\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Keyboard\n */\nexport interface Keyboard {\n  mode: KeyboardMode                   /* Keyboard mode */\n  type: string                         /* Key type */\n  claim(): void                        /* Key claim */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Check whether an element may receive keyboard input\n *\n * @param el - Element\n * @param type - Key type\n *\n * @returns Test result\n */\nfunction isSusceptibleToKeyboard(\n  el: HTMLElement, type: string\n): boolean {\n  switch (el.constructor) {\n\n    /* Input elements */\n    case HTMLInputElement:\n      /* @ts-expect-error - omit unnecessary type cast */\n      if (el.type === \"radio\")\n        return /^Arrow/.test(type)\n      else\n        return true\n\n    /* Select element and textarea */\n    case HTMLSelectElement:\n    case HTMLTextAreaElement:\n      return true\n\n    /* Everything else */\n    default:\n   
   return el.isContentEditable\n  }\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch composition events\n *\n * @returns Composition observable\n */\nexport function watchComposition(): Observable<boolean> {\n  return merge(\n    fromEvent(window, \"compositionstart\").pipe(map(() => true)),\n    fromEvent(window, \"compositionend\").pipe(map(() => false))\n  )\n    .pipe(\n      startWith(false)\n    )\n}\n\n/**\n * Watch keyboard\n *\n * @returns Keyboard observable\n */\nexport function watchKeyboard(): Observable<Keyboard> {\n  const keyboard$ = fromEvent<KeyboardEvent>(window, \"keydown\")\n    .pipe(\n      filter(ev => !(ev.metaKey || ev.ctrlKey)),\n      map(ev => ({\n        mode: getToggle(\"search\") ? \"search\" : \"global\",\n        type: ev.key,\n        claim() {\n          ev.preventDefault()\n          ev.stopPropagation()\n        }\n      } as Keyboard)),\n      filter(({ mode, type }) => {\n        if (mode === \"global\") {\n          const active = getActiveElement()\n          if (typeof active !== \"undefined\")\n            return !isSusceptibleToKeyboard(active, type)\n        }\n        return true\n      }),\n      share()\n    )\n\n  /* Don't emit during composition events - see https://bit.ly/3te3Wl8 */\n  return watchComposition()\n    .pipe(\n      switchMap(active => !active ? 
keyboard$ : EMPTY)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { Subject } from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve location\n *\n * This function returns a `URL` object (and not `Location`) to normalize the\n * typings across the application. 
Furthermore, locations need to be tracked\n * without setting them and `Location` is a singleton which represents the\n * current location.\n *\n * @returns URL\n */\nexport function getLocation(): URL {\n  return new URL(location.href)\n}\n\n/**\n * Set location\n *\n * If instant navigation is enabled, this function creates a temporary anchor\n * element, sets the `href` attribute, appends it to the body, clicks it, and\n * then removes it again. The event will bubble up the DOM and trigger be\n * intercepted by the instant loading business logic.\n *\n * Note that we must append and remove the anchor element, or the event will\n * not bubble up the DOM, making it impossible to intercept it.\n *\n * @param url - URL to navigate to\n * @param navigate - Force navigation\n */\nexport function setLocation(\n  url: URL | HTMLLinkElement, navigate = false\n): void {\n  if (feature(\"navigation.instant\") && !navigate) {\n    const el = h(\"a\", { href: url.href })\n    document.body.appendChild(el)\n    el.click()\n    el.remove()\n\n  // If we're not using instant navigation, and the page should not be reloaded\n  // just instruct the browser to navigate to the given URL\n  } else {\n    location.href = url.href\n  }\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch location\n *\n * @returns Location subject\n */\nexport function watchLocation(): Subject<URL> {\n  return new Subject<URL>()\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following 
conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  filter,\n  fromEvent,\n  map,\n  merge,\n  shareReplay,\n  startWith\n} from \"rxjs\"\n\nimport { getOptionalElement } from \"~/browser\"\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve location hash\n *\n * @returns Location hash\n */\nexport function getLocationHash(): string {\n  return location.hash.slice(1)\n}\n\n/**\n * Set location hash\n *\n * Setting a new fragment identifier via `location.hash` will have no effect\n * if the value doesn't change. 
When a new fragment identifier is set, we want\n * the browser to target the respective element at all times, which is why we\n * use this dirty little trick.\n *\n * @param hash - Location hash\n */\nexport function setLocationHash(hash: string): void {\n  const el = h(\"a\", { href: hash })\n  el.addEventListener(\"click\", ev => ev.stopPropagation())\n  el.click()\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch location hash\n *\n * @param location$ - Location observable\n *\n * @returns Location hash observable\n */\nexport function watchLocationHash(\n  location$: Observable<URL>\n): Observable<string> {\n  return merge(\n    fromEvent<HashChangeEvent>(window, \"hashchange\"),\n    location$\n  )\n    .pipe(\n      map(getLocationHash),\n      startWith(getLocationHash()),\n      filter(hash => hash.length > 0),\n      shareReplay(1)\n    )\n}\n\n/**\n * Watch location target\n *\n * @param location$ - Location observable\n *\n * @returns Location target observable\n */\nexport function watchLocationTarget(\n  location$: Observable<URL>\n): Observable<HTMLElement> {\n  return watchLocationHash(location$)\n    .pipe(\n      map(id => getOptionalElement(`[id=\"${id}\"]`)!),\n      filter(el => typeof el !== \"undefined\")\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * 
THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  fromEvent,\n  fromEventPattern,\n  map,\n  merge,\n  startWith,\n  switchMap\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch media query\n *\n * Note that although `MediaQueryList.addListener` is deprecated we have to\n * use it, because it's the only way to ensure proper downward compatibility.\n *\n * @see https://bit.ly/3dUBH2m - GitHub issue\n *\n * @param query - Media query\n *\n * @returns Media observable\n */\nexport function watchMedia(query: string): Observable<boolean> {\n  const media = matchMedia(query)\n  return fromEventPattern<boolean>(next => (\n    media.addListener(() => next(media.matches))\n  ))\n    .pipe(\n      startWith(media.matches)\n    )\n}\n\n/**\n * Watch print mode\n *\n * @returns Print observable\n */\nexport function watchPrint(): Observable<boolean> {\n  const media = matchMedia(\"print\")\n  return merge(\n    fromEvent(window, \"beforeprint\").pipe(map(() => true)),\n    fromEvent(window, \"afterprint\").pipe(map(() => false))\n  )\n    .pipe(\n      startWith(media.matches)\n    )\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Toggle an observable with a media observable\n *\n * @template T - Data type\n *\n * @param query$ - Media observable\n * @param factory - Observable factory\n *\n * 
@returns Toggled observable\n */\nexport function at<T>(\n  query$: Observable<boolean>, factory: () => Observable<T>\n): Observable<T> {\n  return query$\n    .pipe(\n      switchMap(active => active ? factory() : EMPTY)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  map,\n  shareReplay,\n  switchMap\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Options\n */\ninterface Options {\n  progress$?: Subject<number>          // Progress subject\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch the given URL\n *\n * This function returns an observable that emits the response as a blob and\n * completes, or emits an error if the request failed. 
The caller can cancel\n * the request by unsubscribing at any time, which will automatically abort\n * the inflight request and complete the observable.\n *\n * Note that we use `XMLHTTPRequest` not because we're nostalgic, but because\n * it's the only way to get progress events for downloads and also allow for\n * cancellation of requests, as the official Fetch API does not support this\n * yet, even though we're already in 2024.\n *\n * @param url - Request URL\n * @param options - Options\n *\n * @returns Data observable\n */\nexport function request(\n  url: URL | string, options?: Options\n): Observable<Blob> {\n  return new Observable<Blob>(observer => {\n    const req = new XMLHttpRequest()\n    req.open(\"GET\", `${url}`)\n    req.responseType = \"blob\"\n\n    // Handle response\n    req.addEventListener(\"load\", () => {\n      if (req.status >= 200 && req.status < 300) {\n        observer.next(req.response)\n        observer.complete()\n\n      // Every response that is not in the 2xx range is considered an error\n      } else {\n        observer.error(new Error(req.statusText))\n      }\n    })\n\n    // Handle network errors\n    req.addEventListener(\"error\", () => {\n      observer.error(new Error(\"Network error\"))\n    })\n\n    // Handle aborted requests\n    req.addEventListener(\"abort\", () => {\n      observer.complete()\n    })\n\n    // Handle download progress\n    if (typeof options?.progress$ !== \"undefined\") {\n      req.addEventListener(\"progress\", event => {\n        if (event.lengthComputable) {\n          options.progress$!.next((event.loaded / event.total) * 100)\n\n        // Hack: Chromium doesn't report the total number of bytes if content\n        // is compressed, so we need this fallback - see https://t.ly/ZXofI\n        } else {\n          const length = req.getResponseHeader(\"Content-Length\") ?? 
0\n          options.progress$!.next((event.loaded / +length) * 100)\n        }\n      })\n\n      // Immediately set progress to 5% to indicate that we're loading\n      options.progress$.next(5)\n    }\n\n    // Send request and automatically abort request upon unsubscription\n    req.send()\n    return () => req.abort()\n  })\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Fetch JSON from the given URL\n *\n * @template T - Data type\n *\n * @param url - Request URL\n * @param options - Options\n *\n * @returns Data observable\n */\nexport function requestJSON<T>(\n  url: URL | string, options?: Options\n): Observable<T> {\n  return request(url, options)\n    .pipe(\n      switchMap(res => res.text()),\n      map(body => JSON.parse(body) as T),\n      shareReplay(1)\n    )\n}\n\n/**\n * Fetch HTML from the given URL\n *\n * @param url - Request URL\n * @param options - Options\n *\n * @returns Data observable\n */\nexport function requestHTML(\n  url: URL | string, options?: Options\n): Observable<Document> {\n  const dom = new DOMParser()\n  return request(url, options)\n    .pipe(\n      switchMap(res => res.text()),\n      map(res => dom.parseFromString(res, \"text/html\")),\n      shareReplay(1)\n    )\n}\n\n/**\n * Fetch XML from the given URL\n *\n * @param url - Request URL\n * @param options - Options\n *\n * @returns Data observable\n */\nexport function requestXML(\n  url: URL | string, options?: Options\n): Observable<Document> {\n  const dom = new DOMParser()\n  return request(url, options)\n    .pipe(\n      switchMap(res => res.text()),\n      map(res => dom.parseFromString(res, \"text/xml\")),\n      shareReplay(1)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without 
restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  fromEvent,\n  map,\n  merge,\n  startWith\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Viewport offset\n */\nexport interface ViewportOffset {\n  x: number                            /* Horizontal offset */\n  y: number                            /* Vertical offset */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve viewport offset\n *\n * On iOS Safari, viewport offset can be negative due to overflow scrolling.\n * As this may induce strange behaviors downstream, we'll just limit it to 0.\n *\n * @returns Viewport offset\n */\nexport function getViewportOffset(): ViewportOffset {\n  return {\n    x: Math.max(0, scrollX),\n    y: Math.max(0, scrollY)\n  }\n}\n\n/* ------------------------------------------------------------------------- 
*/\n\n/**\n * Watch viewport offset\n *\n * @returns Viewport offset observable\n */\nexport function watchViewportOffset(): Observable<ViewportOffset> {\n  return merge(\n    fromEvent(window, \"scroll\", { passive: true }),\n    fromEvent(window, \"resize\", { passive: true })\n  )\n    .pipe(\n      map(getViewportOffset),\n      startWith(getViewportOffset())\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  fromEvent,\n  map,\n  startWith\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Viewport size\n */\nexport interface ViewportSize {\n  width: number                        /* Viewport width */\n  height: number                       /* Viewport height */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve viewport size\n *\n * @returns Viewport size\n */\nexport function getViewportSize(): ViewportSize {\n  return {\n    width:  innerWidth,\n    height: innerHeight\n  }\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Watch viewport size\n *\n * @returns Viewport size observable\n */\nexport function watchViewportSize(): Observable<ViewportSize> {\n  return fromEvent(window, \"resize\", { passive: true })\n    .pipe(\n      map(getViewportSize),\n      startWith(getViewportSize())\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following 
conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  combineLatest,\n  map,\n  shareReplay\n} from \"rxjs\"\n\nimport {\n  ViewportOffset,\n  watchViewportOffset\n} from \"../offset\"\nimport {\n  ViewportSize,\n  watchViewportSize\n} from \"../size\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Viewport\n */\nexport interface Viewport {\n  offset: ViewportOffset               /* Viewport offset */\n  size: ViewportSize                   /* Viewport size */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch viewport\n *\n * @returns Viewport observable\n */\nexport function watchViewport(): Observable<Viewport> {\n  return combineLatest([\n    watchViewportOffset(),\n    watchViewportSize()\n  ])\n    .pipe(\n      map(([offset, size]) => ({ offset, size })),\n      shareReplay(1)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without 
restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  combineLatest,\n  distinctUntilKeyChanged,\n  map\n} from \"rxjs\"\n\nimport { Header } from \"~/components\"\n\nimport { getElementOffset } from \"../../element\"\nimport { Viewport } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch viewport relative to element\n *\n * @param el - Element\n * @param options - Options\n *\n * @returns Viewport observable\n */\nexport function watchViewportAt(\n  el: HTMLElement, { viewport$, header$ }: WatchOptions\n): Observable<Viewport> {\n  const size$ = viewport$\n    .pipe(\n      
distinctUntilKeyChanged(\"size\")\n    )\n\n  /* Compute element offset */\n  const offset$ = combineLatest([size$, header$])\n    .pipe(\n      map(() => getElementOffset(el))\n    )\n\n  /* Compute relative viewport, return hot observable */\n  return combineLatest([header$, viewport$, offset$])\n    .pipe(\n      map(([{ height }, { offset, size }, { x, y }]) => ({\n        offset: {\n          x: offset.x - x,\n          y: offset.y - y + height\n        },\n        size\n      }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  endWith,\n  fromEvent,\n  ignoreElements,\n  mergeWith,\n  share,\n  takeUntil\n} from \"rxjs\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Worker message\n */\nexport interface WorkerMessage {\n  type: unknown                        /* Message type */\n  data?: unknown                       /* Message data */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Create an observable for receiving from a web worker\n *\n * @template T - Data type\n *\n * @param worker - Web worker\n *\n * @returns Message observable\n */\nfunction recv<T>(worker: Worker): Observable<T> {\n  return fromEvent<MessageEvent<T>, T>(worker, \"message\", ev => ev.data)\n}\n\n/**\n * Create a subject for sending to a web worker\n *\n * @template T - Data type\n *\n * @param worker - Web worker\n *\n * @returns Message subject\n */\nfunction send<T>(worker: Worker): Subject<T> {\n  const send$ = new Subject<T>()\n  send$.subscribe(data => worker.postMessage(data))\n\n  /* Return message subject */\n  return send$\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Create a bidirectional communication channel to a web worker\n *\n * @template T - Data type\n *\n * @param url - Worker URL\n * @param worker - Worker\n *\n * @returns Worker subject\n */\nexport 
function watchWorker<T extends WorkerMessage>(\n  url: string, worker = new Worker(url)\n): Subject<T> {\n  const recv$ = recv<T>(worker)\n  const send$ = send<T>(worker)\n\n  /* Create worker subject and forward messages */\n  const worker$ = new Subject<T>()\n  worker$.subscribe(send$)\n\n  /* Return worker subject */\n  const done$ = send$.pipe(ignoreElements(), endWith(true))\n  return worker$\n    .pipe(\n      ignoreElements(),\n      mergeWith(recv$.pipe(takeUntil(done$))),\n      share()\n    ) as Subject<T>\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { getElement, getLocation } from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Feature flag\n */\nexport type Flag =\n  | \"announce.dismiss\"                 /* Dismissable announcement bar */\n  | \"content.code.annotate\"            /* Code annotations */\n  | \"content.code.copy\"                /* Code copy button */\n  | \"content.lazy\"                     /* Lazy content elements */\n  | \"content.tabs.link\"                /* Link content tabs */\n  | \"content.tooltips\"                 /* Tooltips */\n  | \"header.autohide\"                  /* Hide header */\n  | \"navigation.expand\"                /* Automatic expansion */\n  | \"navigation.indexes\"               /* Section pages */\n  | \"navigation.instant\"               /* Instant navigation */\n  | \"navigation.instant.progress\"      /* Instant navigation progress */\n  | \"navigation.sections\"              /* Section navigation */\n  | \"navigation.tabs\"                  /* Tabs navigation */\n  | \"navigation.tabs.sticky\"           /* Tabs navigation (sticky) */\n  | \"navigation.top\"                   /* Back-to-top button */\n  | \"navigation.tracking\"              /* Anchor tracking */\n  | \"search.highlight\"                 /* Search highlighting */\n  | \"search.share\"                     /* Search sharing */\n  | \"search.suggest\"                   /* Search suggestions */\n  | \"toc.follow\"                       /* Following table of contents */\n  | \"toc.integrate\"                    /* Integrated table of contents */\n\n/* 
------------------------------------------------------------------------- */\n\n/**\n * Translation\n */\nexport type Translation =\n  | \"clipboard.copy\"                   /* Copy to clipboard */\n  | \"clipboard.copied\"                 /* Copied to clipboard */\n  | \"search.result.placeholder\"        /* Type to start searching */\n  | \"search.result.none\"               /* No matching documents */\n  | \"search.result.one\"                /* 1 matching document */\n  | \"search.result.other\"              /* # matching documents */\n  | \"search.result.more.one\"           /* 1 more on this page */\n  | \"search.result.more.other\"         /* # more on this page */\n  | \"search.result.term.missing\"       /* Missing */\n  | \"select.version\"                   /* Version selector */\n\n/**\n * Translations\n */\nexport type Translations =\n  Record<Translation, string>\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Versioning\n */\nexport interface Versioning {\n  provider: \"mike\"                     /* Version provider */\n  default?: string | string[]          /* Default version */\n  alias?: boolean                      /* Show alias */\n}\n\n/**\n * Configuration\n */\nexport interface Config {\n  base: string                         /* Base URL */\n  features: Flag[]                     /* Feature flags */\n  translations: Translations           /* Translations */\n  search: string                       /* Search worker URL */\n  tags?: Record<string, string>        /* Tags mapping */\n  version?: Versioning                 /* Versioning */\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve global configuration and make base URL absolute\n */\nconst script = getElement(\"#__config\")\nconst config: Config = JSON.parse(script.textContent!)\nconfig.base = `${new 
URL(config.base, getLocation())}`\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve global configuration\n *\n * @returns Global configuration\n */\nexport function configuration(): Config {\n  return config\n}\n\n/**\n * Check whether a feature flag is enabled\n *\n * @param flag - Feature flag\n *\n * @returns Test result\n */\nexport function feature(flag: Flag): boolean {\n  return config.features.includes(flag)\n}\n\n/**\n * Retrieve the translation for the given key\n *\n * @param key - Key to be translated\n * @param value - Positional value, if any\n *\n * @returns Translation\n */\nexport function translation(\n  key: Translation, value?: string | number\n): string {\n  return typeof value !== \"undefined\"\n    ? config.translations[key].replace(\"#\", value.toString())\n    : config.translations[key]\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { getElement, getElements } from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Component type\n */\nexport type ComponentType =\n  | \"announce\"                         /* Announcement bar */\n  | \"container\"                        /* Container */\n  | \"consent\"                          /* Consent */\n  | \"content\"                          /* Content */\n  | \"dialog\"                           /* Dialog */\n  | \"header\"                           /* Header */\n  | \"header-title\"                     /* Header title */\n  | \"header-topic\"                     /* Header topic */\n  | \"main\"                             /* Main area */\n  | \"outdated\"                         /* Version warning */\n  | \"palette\"                          /* Color palette */\n  | \"progress\"                         /* Progress indicator */\n  | \"search\"                           /* Search */\n  | \"search-query\"                     /* Search input */\n  | \"search-result\"                    /* Search results */\n  | \"search-share\"                     /* Search sharing */\n  | \"search-suggest\"                   /* Search suggestions */\n  | \"sidebar\"                          /* Sidebar */\n  | \"skip\"                             /* Skip link */\n  | \"source\"                           /* Repository information */\n  | \"tabs\"                             /* Navigation tabs */\n  | \"toc\"                              /* Table of contents */\n  | \"top\"                              /* Back-to-top button */\n\n/**\n * 
Component\n *\n * @template T - Component type\n * @template U - Reference type\n */\nexport type Component<\n  T extends {} = {},\n  U extends HTMLElement = HTMLElement\n> =\n  T & {\n    ref: U                             /* Component reference */\n  }\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Component type map\n */\ninterface ComponentTypeMap {\n  \"announce\": HTMLElement              /* Announcement bar */\n  \"container\": HTMLElement             /* Container */\n  \"consent\": HTMLElement               /* Consent */\n  \"content\": HTMLElement               /* Content */\n  \"dialog\": HTMLElement                /* Dialog */\n  \"header\": HTMLElement                /* Header */\n  \"header-title\": HTMLElement          /* Header title */\n  \"header-topic\": HTMLElement          /* Header topic */\n  \"main\": HTMLElement                  /* Main area */\n  \"outdated\": HTMLElement              /* Version warning */\n  \"palette\": HTMLElement               /* Color palette */\n  \"progress\": HTMLElement              /* Progress indicator */\n  \"search\": HTMLElement                /* Search */\n  \"search-query\": HTMLInputElement     /* Search input */\n  \"search-result\": HTMLElement         /* Search results */\n  \"search-share\": HTMLAnchorElement    /* Search sharing */\n  \"search-suggest\": HTMLElement        /* Search suggestions */\n  \"sidebar\": HTMLElement               /* Sidebar */\n  \"skip\": HTMLAnchorElement            /* Skip link */\n  \"source\": HTMLAnchorElement          /* Repository information */\n  \"tabs\": HTMLElement                  /* Navigation tabs */\n  \"toc\": HTMLElement                   /* Table of contents */\n  \"top\": HTMLAnchorElement             /* Back-to-top button */\n}\n\n/* ----------------------------------------------------------------------------\n * 
Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve the element for a given component or throw a reference error\n *\n * @template T - Component type\n *\n * @param type - Component type\n * @param node - Node of reference\n *\n * @returns Element\n */\nexport function getComponentElement<T extends ComponentType>(\n  type: T, node: ParentNode = document\n): ComponentTypeMap[T] {\n  return getElement(`[data-md-component=${type}]`, node)\n}\n\n/**\n * Retrieve all elements for a given component\n *\n * @template T - Component type\n *\n * @param type - Component type\n * @param node - Node of reference\n *\n * @returns Elements\n */\nexport function getComponentElements<T extends ComponentType>(\n  type: T, node: ParentNode = document\n): ComponentTypeMap[T][] {\n  return getElements(`[data-md-component=${type}]`, node)\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  defer,\n  finalize,\n  fromEvent,\n  map,\n  tap\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport { getElement } from \"~/browser\"\n\nimport { Component } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Announcement bar\n */\nexport interface Announce {\n  hash: number                        /* Content hash */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch announcement bar\n *\n * @param el - Announcement bar element\n *\n * @returns Announcement bar observable\n */\nexport function watchAnnounce(\n  el: HTMLElement\n): Observable<Announce> {\n  const button = getElement(\".md-typeset > :first-child\", el)\n  return fromEvent(button, \"click\", { once: true })\n    .pipe(\n      map(() => getElement(\".md-typeset\", el)),\n      map(content => ({ hash: __md_hash(content.innerHTML) }))\n    )\n}\n\n/**\n * Mount announcement bar\n *\n * @param el - Announcement bar element\n *\n * @returns Announcement bar component observable\n */\nexport function mountAnnounce(\n  el: HTMLElement\n): Observable<Component<Announce>> {\n  if (!feature(\"announce.dismiss\") || !el.childElementCount)\n    return EMPTY\n\n  /* Support instant navigation - see https://t.ly/3FTme */\n  if (!el.hidden) {\n    const content = getElement(\".md-typeset\", el)\n    if (__md_hash(content.innerHTML) === __md_get(\"__announce\"))\n      el.hidden = true\n  }\n\n 
 /* Mount component on subscription */\n  return defer(() => {\n    const push$ = new Subject<Announce>()\n    push$.subscribe(({ hash }) => {\n      el.hidden = true\n\n      /* Persist preference in local storage */\n      __md_set<number>(\"__announce\", hash)\n    })\n\n    /* Create and return component */\n    return watchAnnounce(el)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  finalize,\n  map,\n  tap\n} from \"rxjs\"\n\nimport { Component } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Consent\n */\nexport interface Consent {\n  hidden: boolean                      /* Consent is hidden */\n}\n\n/**\n * Consent defaults\n */\nexport interface ConsentDefaults {\n  analytics?: boolean                  /* Consent for Analytics */\n  github?: boolean                     /* Consent for GitHub */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  target$: Observable<HTMLElement>     /* Target observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  target$: Observable<HTMLElement>     /* Target observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch consent\n *\n * @param el - Consent element\n * @param options - Options\n *\n * @returns Consent observable\n */\nexport function watchConsent(\n  el: HTMLElement, { target$ }: WatchOptions\n): Observable<Consent> {\n  return target$\n    .pipe(\n      map(target => ({ hidden: target !== el }))\n    )\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Mount consent\n *\n * @param el - Consent element\n * @param options - 
Options\n *\n * @returns Consent component observable\n */\nexport function mountConsent(\n  el: HTMLElement, options: MountOptions\n): Observable<Component<Consent>> {\n  const internal$ = new Subject<Consent>()\n  internal$.subscribe(({ hidden }) => {\n    el.hidden = hidden\n  })\n\n  /* Create and return component */\n  return watchConsent(el, options)\n    .pipe(\n      tap(state => internal$.next(state)),\n      finalize(() => internal$.complete()),\n      map(state => ({ ref: el, ...state }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { ComponentChild } from \"preact\"\n\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Tooltip style\n */\nexport type TooltipStyle =\n  | \"inline\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render a tooltip\n *\n * @param id - Tooltip identifier\n * @param style - Tooltip style\n *\n * @returns Element\n */\nexport function renderTooltip(\n  id?: string, style?: TooltipStyle\n): HTMLElement {\n  if (style === \"inline\") { // @todo refactor control flow\n    return (\n      <div class=\"md-tooltip md-tooltip--inline\" id={id} role=\"tooltip\">\n        <div class=\"md-tooltip__inner md-typeset\"></div>\n      </div>\n    )\n  } else {\n    return (\n      <div class=\"md-tooltip\" id={id} role=\"tooltip\">\n        <div class=\"md-tooltip__inner md-typeset\"></div>\n      </div>\n    )\n  }\n}\n\n// @todo: rename\nexport function renderInlineTooltip2(\n  ...children: ComponentChild[]\n): HTMLElement {\n  return (\n    <div class=\"md-tooltip2\" role=\"tooltip\">\n      <div class=\"md-tooltip2__inner md-typeset\">\n        {children}\n      </div>\n    </div>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including 
without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { h } from \"~/utilities\"\n\nimport { renderTooltip } from \"../tooltip\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render an annotation\n *\n * @param id - Annotation identifier\n * @param prefix - Tooltip identifier prefix\n *\n * @returns Element\n */\nexport function renderAnnotation(\n  id: string | number, prefix?: string\n): HTMLElement {\n  prefix = prefix ? `${prefix}_annotation_${id}` : undefined\n\n  /* Render tooltip with anchor, if given */\n  if (prefix) {\n    const anchor = prefix ? 
`#${prefix}` : undefined\n    return (\n      <aside class=\"md-annotation\" tabIndex={0}>\n        {renderTooltip(prefix)}\n        <a href={anchor} class=\"md-annotation__index\" tabIndex={-1}>\n          <span data-md-annotation-id={id}></span>\n        </a>\n      </aside>\n    )\n  } else {\n    return (\n      <aside class=\"md-annotation\" tabIndex={0}>\n        {renderTooltip(prefix)}\n        <span class=\"md-annotation__index\" tabIndex={-1}>\n          <span data-md-annotation-id={id}></span>\n        </span>\n      </aside>\n    )\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { translation } from \"~/_\"\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render a 'copy-to-clipboard' button\n *\n * @param id - Unique identifier\n *\n * @returns Element\n */\nexport function renderClipboardButton(id: string): HTMLElement {\n  return (\n    <button\n      class=\"md-clipboard md-icon\"\n      title={translation(\"clipboard.copy\")}\n      data-clipboard-target={`#${id} > code`}\n    ></button>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { ComponentChild } from \"preact\"\n\nimport { configuration, feature, translation } from \"~/_\"\nimport { SearchItem } from \"~/integrations/search\"\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Render flag\n */\nconst enum Flag {\n  TEASER = 1,                          /* Render teaser */\n  PARENT = 2                           /* Render as parent */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper function\n * ------------------------------------------------------------------------- */\n\n/**\n * Render a search document\n *\n * @param document - Search document\n * @param flag - Render flags\n *\n * @returns Element\n */\nfunction renderSearchDocument(\n  document: SearchItem, flag: Flag\n): HTMLElement {\n  const parent = flag & Flag.PARENT\n  const teaser = flag & Flag.TEASER\n\n  /* Render missing query terms */\n  const missing = Object.keys(document.terms)\n    .filter(key => !document.terms[key])\n    .reduce<ComponentChild[]>((list, key) => [\n      ...list, <del>{key}</del>, \" \"\n    ], [])\n    .slice(0, -1)\n\n  /* Assemble query string for highlighting */\n  const config = configuration()\n  const url = new URL(document.location, config.base)\n  if (feature(\"search.highlight\"))\n    url.searchParams.set(\"h\", Object.entries(document.terms)\n      .filter(([, match]) => match)\n      .reduce((highlight, [value]) => `${highlight} ${value}`.trim(), \"\")\n    )\n\n  /* Render article or section, depending on flags */\n  const { tags } = 
configuration()\n  return (\n    <a href={`${url}`} class=\"md-search-result__link\" tabIndex={-1}>\n      <article\n        class=\"md-search-result__article md-typeset\"\n        data-md-score={document.score.toFixed(2)}\n      >\n        {parent > 0 && <div class=\"md-search-result__icon md-icon\"></div>}\n        {parent > 0 && <h1>{document.title}</h1>}\n        {parent <= 0 && <h2>{document.title}</h2>}\n        {teaser > 0 && document.text.length > 0 &&\n          document.text\n        }\n        {document.tags && document.tags.map(tag => {\n          const type = tags\n            ? tag in tags\n              ? `md-tag-icon md-tag--${tags[tag]}`\n              : \"md-tag-icon\"\n            : \"\"\n          return (\n            <span class={`md-tag ${type}`}>{tag}</span>\n          )\n        })}\n        {teaser > 0 && missing.length > 0 &&\n          <p class=\"md-search-result__terms\">\n            {translation(\"search.result.term.missing\")}: {...missing}\n          </p>\n        }\n      </article>\n    </a>\n  )\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render a search result\n *\n * @param result - Search result\n *\n * @returns Element\n */\nexport function renderSearchResultItem(\n  result: SearchItem[]\n): HTMLElement {\n  const threshold = result[0].score\n  const docs = [...result]\n\n  const config = configuration()\n\n  /* Find and extract parent article */\n  const parent = docs.findIndex(doc => {\n    const l = `${new URL(doc.location, config.base)}` // @todo hacky\n    return !l.includes(\"#\")\n  })\n  const [article] = docs.splice(parent, 1)\n\n  /* Determine last index above threshold */\n  let index = docs.findIndex(doc => doc.score < threshold)\n  if (index === -1)\n    index = docs.length\n\n  /* Partition sections */\n  const best = docs.slice(0, index)\n  const more = 
docs.slice(index)\n\n  /* Render children */\n  const children = [\n    renderSearchDocument(article, Flag.PARENT | +(!parent && index === 0)),\n    ...best.map(section => renderSearchDocument(section, Flag.TEASER)),\n    ...more.length ? [\n      <details class=\"md-search-result__more\">\n        <summary tabIndex={-1}>\n          <div>\n            {more.length > 0 && more.length === 1\n              ? translation(\"search.result.more.one\")\n              : translation(\"search.result.more.other\", more.length)\n            }\n          </div>\n        </summary>\n        {...more.map(section => renderSearchDocument(section, Flag.TEASER))}\n      </details>\n    ] : []\n  ]\n\n  /* Render search result */\n  return (\n    <li class=\"md-search-result__item\">\n      {children}\n    </li>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { SourceFacts } from \"~/components\"\nimport { h, round } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render repository facts\n *\n * @param facts - Repository facts\n *\n * @returns Element\n */\nexport function renderSourceFacts(facts: SourceFacts): HTMLElement {\n  return (\n    <ul class=\"md-source__facts\">\n      {Object.entries(facts).map(([key, value]) => (\n        <li class={`md-source__fact md-source__fact--${key}`}>\n          {typeof value === \"number\" ? round(value) : value}\n        </li>\n      ))}\n    </ul>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Tabbed control type\n */\ntype TabbedControlType =\n  | \"prev\"\n  | \"next\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render control for content tabs\n *\n * @param type - Control type\n *\n * @returns Element\n */\nexport function renderTabbedControl(\n  type: TabbedControlType\n): HTMLElement {\n  const classes = `tabbed-control tabbed-control--${type}`\n  return (\n    <div class={classes} hidden>\n      <button class=\"tabbed-button\" tabIndex={-1} aria-hidden=\"true\"></button>\n    </div>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render a table inside a wrapper to improve scrolling on mobile\n *\n * @param table - Table element\n *\n * @returns Element\n */\nexport function renderTable(table: HTMLElement): HTMLElement {\n  return (\n    <div class=\"md-typeset__scrollwrap\">\n      <div class=\"md-typeset__table\">\n        {table}\n      </div>\n    </div>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { configuration, translation } from \"~/_\"\nimport { h } from \"~/utilities\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Version properties\n */\nexport interface VersionProperties {\n  hidden?: boolean                     /* Version is hidden */\n}\n\n/**\n * Version\n */\nexport interface Version {\n  version: string                      /* Version identifier */\n  title: string                        /* Version title */\n  aliases: string[]                    /* Version aliases */\n  properties?: VersionProperties       /* Version properties */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Render a version\n *\n * @param version - Version\n *\n * @returns Element\n */\nfunction renderVersion(version: Version): HTMLElement {\n  const config = configuration()\n\n  /* Ensure trailing slash - see https://bit.ly/3rL5u3f */\n  const url = new URL(`../${version.version}/`, config.base)\n  return (\n    <li class=\"md-version__item\">\n      <a href={`${url}`} class=\"md-version__link\">\n        {version.title}\n        {config.version?.alias && version.aliases.length > 0 && (\n          <span class=\"md-version__alias\">\n            {version.aliases[0]}\n          </span>\n        )}\n      </a>\n    </li>\n  )\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- 
*/\n\n/**\n * Render a version selector\n *\n * @param versions - Versions\n * @param active - Active version\n *\n * @returns Element\n */\nexport function renderVersionSelector(\n  versions: Version[], active: Version\n): HTMLElement {\n  const config = configuration()\n  versions = versions.filter(version => !version.properties?.hidden)\n  return (\n    <div class=\"md-version\">\n      <button\n        class=\"md-version__current\"\n        aria-label={translation(\"select.version\")}\n      >\n        {active.title}\n        {config.version?.alias && active.aliases.length > 0 && (\n          <span class=\"md-version__alias\">\n            {active.aliases[0]}\n          </span>\n        )}\n      </button>\n      <ul class=\"md-version__list\">\n        {versions.map(renderVersion)}\n      </ul>\n    </div>\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  BehaviorSubject,\n  EMPTY,\n  Observable,\n  Subject,\n  animationFrameScheduler,\n  combineLatest,\n  debounce,\n  defer,\n  distinctUntilChanged,\n  endWith,\n  filter,\n  finalize,\n  first,\n  ignoreElements,\n  map,\n  mergeMap,\n  observeOn,\n  queueScheduler,\n  share,\n  startWith,\n  switchMap,\n  tap,\n  throttleTime,\n  timer,\n  withLatestFrom\n} from \"rxjs\"\n\nimport {\n  ElementOffset,\n  Viewport,\n  getElement,\n  getElementContainers,\n  getElementOffsetAbsolute,\n  getElementSize,\n  watchElementContentOffset,\n  watchElementFocus,\n  watchElementHover\n} from \"~/browser\"\nimport { renderInlineTooltip2 } from \"~/templates\"\n\nimport { Component } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Tooltip\n */\nexport interface Tooltip {\n  active: boolean                      // Tooltip is active\n  offset: ElementOffset                // Tooltip offset\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Dependencies\n */\ninterface Dependencies {\n  content$: Observable<HTMLElement>    // Tooltip content observable\n  viewport$: Observable<Viewport>      // Viewport observable\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Global sequence number for tooltips\n */\nlet sequence = 0\n\n/* 
----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch tooltip\n *\n * This function tracks the tooltip host element, and deduces the active state\n * and offset of the tooltip from it. The active state is determined by whether\n * the host element is focused or hovered, and the offset is determined by the\n * host element's absolute position in the document.\n *\n * @param el - Tooltip host element\n *\n * @returns Tooltip observable\n */\nexport function watchTooltip2(\n  el: HTMLElement\n): Observable<Tooltip> {\n\n  // Compute whether tooltip should be shown - we need to watch both focus and\n  // hover events on the host element and emit if one of them is active. In case\n  // of a hover event, we keep the element visible for a short amount of time\n  // after the pointer left the host element for a better user experience.\n  const active$ =\n    combineLatest([\n      watchElementFocus(el),\n      watchElementHover(el)\n    ])\n      .pipe(\n        map(([focus, hover]) => focus || hover),\n        distinctUntilChanged()\n      )\n\n  // We need to determine all parent elements of the host element that are\n  // currently scrollable, as they might affect the position of the tooltip\n  // depending on their horizontal of vertical offset. We must track all of\n  // them and recompute the position of the tooltip if they change.\n  const offset$ =\n    defer(() => getElementContainers(el)).pipe(\n      mergeMap(watchElementContentOffset),\n      throttleTime(1),\n      map(() => getElementOffsetAbsolute(el))\n    )\n\n  // Only track parent elements and compute offset of the tooltip host if the\n  // tooltip should be shown - we defer the computation of the offset until the\n  // tooltip becomes active for the first time. 
This is necessary, because we\n  // must also keep the tooltip active as long as it is focused or hovered.\n  return active$.pipe(\n    first(active => active),\n    switchMap(() => combineLatest([active$, offset$])),\n    map(([active, offset]) => ({ active, offset })),\n    share()\n  )\n}\n\n/**\n * Mount tooltip\n *\n * This function renders a tooltip with the content from the provided `content$`\n * observable as passed via the dependencies. If the returned element has a role\n * of type `dialog`, the tooltip is considered to be interactive, and rendered\n * either above or below the host element, depending on the available space.\n *\n * If the returned element has a role of type `tooltip`, the tooltip is always\n * rendered below the host element and considered to be non-interactive. This\n * allows us to reuse the same positioning logic for both interactive and\n * non-interactive tooltips, as it is largely the same.\n *\n * @param el - Tooltip host element\n * @param dependencies - Dependencies\n *\n * @returns Tooltip component observable\n */\nexport function mountTooltip2(\n  el: HTMLElement, dependencies: Dependencies\n): Observable<Component<Tooltip>> {\n  const { content$, viewport$ } = dependencies\n\n  // Compute unique tooltip id - this is necessary to associate the tooltip host\n  // element with the tooltip element for ARIA purposes\n  const id = `__tooltip2_${sequence++}`\n\n  // Create component on subscription\n  return defer(() => {\n    const push$ = new Subject<Tooltip>()\n\n    // Create subject to track tooltip presence and visibility - we use another\n    // purely internal subject to track the tooltip's presence and visibility,\n    // as the tooltip should be visible if the host element or tooltip itself\n    // is focused or hovered to allow for smooth pointer migration\n    const show$ = new BehaviorSubject(false)\n    push$.pipe(ignoreElements(), endWith(false))\n      .subscribe(show$)\n\n    // Create observable controlling 
tooltip element - we create and attach the\n    // tooltip only if it is actually present, in order to keep the number of\n    // elements low. We need to keep the tooltip visible for a short time after\n    // the pointer left the host element or tooltip itself. For this, we use an\n    // inner subscription to the tooltip observable, which we terminate when the\n    // tooltip should not be shown, automatically removing the element. Moreover\n    // we use the queue scheduler, which will schedule synchronously in case the\n    // tooltip should be shown, and asynchronously if it should be hidden.\n    const node$ = show$.pipe(\n      debounce(active => timer(+!active * 250, queueScheduler)),\n      distinctUntilChanged(),\n      switchMap(active => active ? content$ : EMPTY),\n      tap(node => node.id = id),\n      share()\n    )\n\n    // Compute tooltip presence and visibility - the tooltip should be shown if\n    // the host element or the tooltip itself is focused or hovered\n    combineLatest([\n      push$.pipe(map(({ active }) => active)),\n      node$.pipe(\n        switchMap(node => watchElementHover(node, 250)),\n        startWith(false)\n      )\n    ])\n      .pipe(map(states => states.some(active => active)))\n      .subscribe(show$)\n\n    // Compute tooltip origin - we need to compute the tooltip origin depending\n    // on the position of the host element, the viewport size, as well as the\n    // actual size of the tooltip, if positioned above. 
The tooltip must about\n    // to be rendered for this to be correct, which is why we do it here.\n    const origin$ = show$.pipe(\n      filter(active => active),\n      withLatestFrom(node$, viewport$),\n      map(([_, node, { size }]) => {\n        const host = el.getBoundingClientRect()\n        const x = host.width / 2\n\n        // If the tooltip is non-interactive, we always render it below the\n        // actual element because all operating systems do it that way\n        if (node.role === \"tooltip\") {\n          return { x, y: 8 + host.height }\n\n        // Otherwise, we determine where there is more space, and render the\n        // tooltip either above or below the host element\n        } else if (host.y >= size.height / 2) {\n          const { height } = getElementSize(node)\n          return { x, y: -16 - height }\n        } else {\n          return { x, y: +16 + host.height }\n        }\n      })\n    )\n\n    // Update tooltip position - we always need to update the position of the\n    // tooltip, as it might change depending on the viewport offset of the host\n    combineLatest([node$, push$, origin$])\n      .subscribe(([node, { offset }, origin]) => {\n        node.style.setProperty(\"--md-tooltip-host-x\", `${offset.x}px`)\n        node.style.setProperty(\"--md-tooltip-host-y\", `${offset.y}px`)\n\n        // Update tooltip origin - this is mainly set to determine the position\n        // of the tooltip tail, to show the direction it is originating from\n        node.style.setProperty(\"--md-tooltip-x\", `${origin.x}px`)\n        node.style.setProperty(\"--md-tooltip-y\", `${origin.y}px`)\n\n        // Update tooltip render location, i.e., whether the tooltip is shown\n        // above or below the host element, depending on the available space\n        node.classList.toggle(\"md-tooltip2--top\",    origin.y <  0)\n        node.classList.toggle(\"md-tooltip2--bottom\", origin.y >= 0)\n      })\n\n    // Update tooltip width - we only 
explicitly set the width of the tooltip\n    // if it is non-interactive, in case it should always be rendered centered\n    show$.pipe(\n      filter(active => active),\n      withLatestFrom(node$, (_, node) => node),\n      filter(node => node.role === \"tooltip\")\n    )\n      .subscribe(node => {\n        const size = getElementSize(getElement(\":scope > *\", node))\n\n        // Set tooltip width and remove tail by setting it to a width of zero -\n        // if authors want to keep the tail, we can move this to CSS later\n        node.style.setProperty(\"--md-tooltip-width\", `${size.width}px`)\n        node.style.setProperty(\"--md-tooltip-tail\",  `${0}px`)\n      })\n\n    // Update tooltip visibility - we defer to the next animation frame, because\n    // the tooltip must first be added to the document before we make it appear,\n    // or it will appear instantly without delay. Additionally, we need to keep\n    // the tooltip visible for a short time after the pointer left the host.\n    show$.pipe(\n      distinctUntilChanged(),\n      observeOn(animationFrameScheduler),\n      withLatestFrom(node$)\n    )\n      .subscribe(([active, node]) => {\n        node.classList.toggle(\"md-tooltip2--active\", active)\n      })\n\n    // Set up ARIA attributes when tooltip is visible\n    combineLatest([\n      show$.pipe(filter(active => active)),\n      node$\n    ])\n      .subscribe(([_, node]) => {\n        if (node.role === \"dialog\") {\n          el.setAttribute(\"aria-controls\", id)\n          el.setAttribute(\"aria-haspopup\", \"dialog\")\n        } else {\n          el.setAttribute(\"aria-describedby\", id)\n        }\n      })\n\n    // Remove ARIA attributes when tooltip is hidden\n    show$.pipe(filter(active => !active))\n      .subscribe(() => {\n        el.removeAttribute(\"aria-controls\")\n        el.removeAttribute(\"aria-describedby\")\n        el.removeAttribute(\"aria-haspopup\")\n      })\n\n    // Create and return component\n    return 
watchTooltip2(el)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n\n// ----------------------------------------------------------------------------\n\n/**\n * Mount inline tooltip\n *\n * @todo refactor this function\n *\n * @param el - Tooltip host element\n * @param dependencies - Dependencies\n * @param container - Container\n *\n * @returns Tooltip component observable\n */\nexport function mountInlineTooltip2(\n  el: HTMLElement, { viewport$ }: { viewport$: Observable<Viewport> },\n  container = document.body\n): Observable<Component<Tooltip>> {\n  return mountTooltip2(el, {\n    content$: new Observable<HTMLElement>(observer => {\n      const title = el.title\n      const node = renderInlineTooltip2(title)\n      observer.next(node)\n      el.removeAttribute(\"title\")\n      // Append tooltip and remove on unsubscription\n      container.append(node)\n      return () => {\n        node.remove()\n        el.setAttribute(\"title\", title)\n      }\n    }),\n    viewport$\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE 
AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  animationFrameScheduler,\n  auditTime,\n  combineLatest,\n  debounceTime,\n  defer,\n  delay,\n  endWith,\n  filter,\n  finalize,\n  fromEvent,\n  ignoreElements,\n  map,\n  merge,\n  switchMap,\n  take,\n  takeUntil,\n  tap,\n  throttleTime,\n  withLatestFrom\n} from \"rxjs\"\n\nimport {\n  ElementOffset,\n  getActiveElement,\n  getElementSize,\n  watchElementContentOffset,\n  watchElementFocus,\n  watchElementOffset,\n  watchElementVisibility\n} from \"~/browser\"\n\nimport { Component } from \"../../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Annotation\n */\nexport interface Annotation {\n  active: boolean                      /* Annotation is active */\n  offset: ElementOffset                /* Annotation offset */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  target$: Observable<HTMLElement>     /* Location target observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch annotation\n *\n * @param el - Annotation element\n * @param container - Containing element\n *\n * @returns Annotation observable\n */\nexport function watchAnnotation(\n  el: HTMLElement, container: HTMLElement\n): Observable<Annotation> {\n  const offset$ = 
defer(() => combineLatest([\n    watchElementOffset(el),\n    watchElementContentOffset(container)\n  ]))\n    .pipe(\n      map(([{ x, y }, scroll]): ElementOffset => {\n        const { width, height } = getElementSize(el)\n        return ({\n          x: x - scroll.x + width  / 2,\n          y: y - scroll.y + height / 2\n        })\n      })\n    )\n\n  /* Actively watch annotation on focus */\n  return watchElementFocus(el)\n    .pipe(\n      switchMap(active => offset$\n        .pipe(\n          map(offset => ({ active, offset })),\n          take(+!active || Infinity)\n        )\n      )\n    )\n}\n\n/**\n * Mount annotation\n *\n * @param el - Annotation element\n * @param container - Containing element\n * @param options - Options\n *\n * @returns Annotation component observable\n */\nexport function mountAnnotation(\n  el: HTMLElement, container: HTMLElement, { target$ }: MountOptions\n): Observable<Component<Annotation>> {\n  const [tooltip, index] = Array.from(el.children)\n\n  /* Mount component on subscription */\n  return defer(() => {\n    const push$ = new Subject<Annotation>()\n    const done$ = push$.pipe(ignoreElements(), endWith(true))\n    push$.subscribe({\n\n      /* Handle emission */\n      next({ offset }) {\n        el.style.setProperty(\"--md-tooltip-x\", `${offset.x}px`)\n        el.style.setProperty(\"--md-tooltip-y\", `${offset.y}px`)\n      },\n\n      /* Handle complete */\n      complete() {\n        el.style.removeProperty(\"--md-tooltip-x\")\n        el.style.removeProperty(\"--md-tooltip-y\")\n      }\n    })\n\n    /* Start animation only when annotation is visible */\n    watchElementVisibility(el)\n      .pipe(\n        takeUntil(done$)\n      )\n        .subscribe(visible => {\n          el.toggleAttribute(\"data-md-visible\", visible)\n        })\n\n    /* Toggle tooltip presence to mitigate empty lines when copying */\n    merge(\n      push$.pipe(filter(({ active }) => active)),\n      push$.pipe(debounceTime(250), 
filter(({ active }) => !active))\n    )\n      .subscribe({\n\n        /* Handle emission */\n        next({ active }) {\n          if (active)\n            el.prepend(tooltip)\n          else\n            tooltip.remove()\n        },\n\n        /* Handle complete */\n        complete() {\n          el.prepend(tooltip)\n        }\n      })\n\n    /* Toggle tooltip visibility */\n    push$\n      .pipe(\n        auditTime(16, animationFrameScheduler)\n      )\n        .subscribe(({ active }) => {\n          tooltip.classList.toggle(\"md-tooltip--active\", active)\n        })\n\n    /* Track relative origin of tooltip */\n    push$\n      .pipe(\n        throttleTime(125, animationFrameScheduler),\n        filter(() => !!el.offsetParent),\n        map(() => el.offsetParent!.getBoundingClientRect()),\n        map(({ x }) => x)\n      )\n        .subscribe({\n\n          /* Handle emission */\n          next(origin) {\n            if (origin)\n              el.style.setProperty(\"--md-tooltip-0\", `${-origin}px`)\n            else\n              el.style.removeProperty(\"--md-tooltip-0\")\n          },\n\n          /* Handle complete */\n          complete() {\n            el.style.removeProperty(\"--md-tooltip-0\")\n          }\n        })\n\n    /* Allow to copy link without scrolling to anchor */\n    fromEvent<MouseEvent>(index, \"click\")\n      .pipe(\n        takeUntil(done$),\n        filter(ev => !(ev.metaKey || ev.ctrlKey))\n      )\n        .subscribe(ev => {\n          ev.stopPropagation()\n          ev.preventDefault()\n        })\n\n    /* Allow to open link in new tab or blur on close */\n    fromEvent<MouseEvent>(index, \"mousedown\")\n      .pipe(\n        takeUntil(done$),\n        withLatestFrom(push$)\n      )\n        .subscribe(([ev, { active }]) => {\n\n          /* Open in new tab */\n          if (ev.button !== 0 || ev.metaKey || ev.ctrlKey) {\n            ev.preventDefault()\n\n          /* Close annotation */\n          } else if (active) {\n 
           ev.preventDefault()\n\n            /* Focus parent annotation, if any */\n            const parent = el.parentElement!.closest(\".md-annotation\")\n            if (parent instanceof HTMLElement)\n              parent.focus()\n            else\n              getActiveElement()?.blur()\n          }\n        })\n\n    /* Open and focus annotation on location target */\n    target$\n      .pipe(\n        takeUntil(done$),\n        filter(target => target === tooltip),\n        delay(125)\n      )\n        .subscribe(() => el.focus())\n\n    /* Create and return component */\n    return watchAnnotation(el, container)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  defer,\n  endWith,\n  finalize,\n  ignoreElements,\n  merge,\n  share,\n  takeUntil\n} from \"rxjs\"\n\nimport {\n  getElement,\n  getElements,\n  getOptionalElement\n} from \"~/browser\"\nimport { renderAnnotation } from \"~/templates\"\n\nimport { Component } from \"../../../_\"\nimport {\n  Annotation,\n  mountAnnotation\n} from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  target$: Observable<HTMLElement>     /* Location target observable */\n  print$: Observable<boolean>          /* Media print observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Find all annotation hosts in the containing element\n *\n * @param container - Containing element\n *\n * @returns Annotation hosts\n */\nfunction findHosts(container: HTMLElement): HTMLElement[] {\n  return container.tagName === \"CODE\"\n    ? 
getElements(\".c, .c1, .cm\", container)\n    : [container]\n}\n\n/**\n * Find all annotation markers in the containing element\n *\n * @param container - Containing element\n *\n * @returns Annotation markers\n */\nfunction findMarkers(container: HTMLElement): Text[] {\n  const markers: Text[] = []\n  for (const el of findHosts(container)) {\n    const nodes: Text[] = []\n\n    /* Find all text nodes in current element */\n    const it = document.createNodeIterator(el, NodeFilter.SHOW_TEXT)\n    for (let node = it.nextNode(); node; node = it.nextNode())\n      nodes.push(node as Text)\n\n    /* Find all markers in each text node */\n    for (let text of nodes) {\n      let match: RegExpExecArray | null\n\n      /* Split text at marker and add to list */\n      while ((match = /(\\(\\d+\\))(!)?/.exec(text.textContent!))) {\n        const [, id, force] = match\n        if (typeof force === \"undefined\") {\n          const marker = text.splitText(match.index)\n          text = marker.splitText(id.length)\n          markers.push(marker)\n\n        /* Replace entire text with marker */\n        } else {\n          text.textContent = id\n          markers.push(text)\n          break\n        }\n      }\n    }\n  }\n  return markers\n}\n\n/**\n * Swap the child nodes of two elements\n *\n * @param source - Source element\n * @param target - Target element\n */\nfunction swap(source: HTMLElement, target: HTMLElement): void {\n  target.append(...Array.from(source.childNodes))\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount annotation list\n *\n * This function analyzes the containing code block and checks for markers\n * referring to elements in the given annotation list. If no markers are found,\n * the list is left untouched. 
Otherwise, list elements are rendered as\n * annotations inside the code block.\n *\n * @param el - Annotation list element\n * @param container - Containing element\n * @param options - Options\n *\n * @returns Annotation component observable\n */\nexport function mountAnnotationList(\n  el: HTMLElement, container: HTMLElement, { target$, print$ }: MountOptions\n): Observable<Component<Annotation>> {\n\n  /* Compute prefix for tooltip anchors */\n  const parent = container.closest(\"[id]\")\n  const prefix = parent?.id\n\n  /* Find and replace all markers with empty annotations */\n  const annotations = new Map<string, HTMLElement>()\n  for (const marker of findMarkers(container)) {\n    const [, id] = marker.textContent!.match(/\\((\\d+)\\)/)!\n    if (getOptionalElement(`:scope > li:nth-child(${id})`, el)) {\n      annotations.set(id, renderAnnotation(id, prefix))\n      marker.replaceWith(annotations.get(id)!)\n    }\n  }\n\n  /* Keep list if there are no annotations to render */\n  if (annotations.size === 0)\n    return EMPTY\n\n  /* Mount component on subscription */\n  return defer(() => {\n    const push$ = new Subject()\n    const done$ = push$.pipe(ignoreElements(), endWith(true))\n\n    /* Retrieve container pairs for swapping */\n    const pairs: [HTMLElement, HTMLElement][] = []\n    for (const [id, annotation] of annotations)\n      pairs.push([\n        getElement(\".md-typeset\", annotation),\n        getElement(`:scope > li:nth-child(${id})`, el)\n      ])\n\n    /* Handle print mode - see https://bit.ly/3rgPdpt */\n    print$.pipe(takeUntil(done$))\n      .subscribe(active => {\n        el.hidden = !active\n\n        /* Add class to discern list element */\n        el.classList.toggle(\"md-annotation-list\", active)\n\n        /* Show annotations in code block or list (print) */\n        for (const [inner, child] of pairs)\n          if (!active)\n            swap(child, inner)\n          else\n            swap(inner, child)\n      })\n\n    /* 
Create and return component */\n    return merge(...[...annotations]\n      .map(([, annotation]) => (\n        mountAnnotation(annotation, container, { target$ })\n      ))\n    )\n      .pipe(\n        finalize(() => push$.complete()),\n        share()\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { EMPTY, Observable, defer } from \"rxjs\"\n\nimport { Component } from \"../../../_\"\nimport { Annotation } from \"../_\"\nimport { mountAnnotationList } from \"../list\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  target$: Observable<HTMLElement>     /* Location target observable */\n  print$: Observable<boolean>          /* Media print observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Find list element directly following a block\n *\n * @param el - Annotation block element\n *\n * @returns List element or nothing\n */\nfunction findList(el: HTMLElement): HTMLElement | undefined {\n  if (el.nextElementSibling) {\n    const sibling = el.nextElementSibling as HTMLElement\n    if (sibling.tagName === \"OL\")\n      return sibling\n\n    /* Skip empty paragraphs - see https://bit.ly/3r4ZJ2O */\n    else if (sibling.tagName === \"P\" && !sibling.children.length)\n      return findList(sibling)\n  }\n\n  /* Everything else */\n  return undefined\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount annotation block\n *\n * @param el - Annotation block element\n * @param options - Options\n *\n * @returns Annotation component observable\n */\nexport function mountAnnotationBlock(\n  el: 
HTMLElement, options: MountOptions\n): Observable<Component<Annotation>> {\n  return defer(() => {\n    const list = findList(el)\n    return typeof list !== \"undefined\"\n      ? mountAnnotationList(list, el, options)\n      : EMPTY\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport ClipboardJS from \"clipboard\"\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  defer,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  filter,\n  finalize,\n  map,\n  mergeWith,\n  switchMap,\n  take,\n  takeLast,\n  takeUntil,\n  tap\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport {\n  getElementContentSize,\n  getElements,\n  watchElementSize,\n  watchElementVisibility\n} from \"~/browser\"\nimport {\n  Tooltip,\n  mountInlineTooltip2\n} from \"~/components/tooltip2\"\nimport { renderClipboardButton } from \"~/templates\"\n\nimport { Component } from \"../../../_\"\nimport {\n  Annotation,\n  mountAnnotationList\n} from \"../../annotation\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Code block overflow\n */\nexport interface Overflow {\n  scrollable: boolean                  /* Code block overflows */\n}\n\n/**\n * Code block\n */\nexport type CodeBlock =\n  | Overflow\n  | Annotation\n  | Tooltip\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  target$: Observable<HTMLElement>     /* Location target observable */\n  print$: Observable<boolean>          /* Media print observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Global sequence number for code blocks\n */\nlet sequence = 0\n\n/* 
----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Find candidate list element directly following a code block\n *\n * @param el - Code block element\n *\n * @returns List element or nothing\n */\nfunction findCandidateList(el: HTMLElement): HTMLElement | undefined {\n  if (el.nextElementSibling) {\n    const sibling = el.nextElementSibling as HTMLElement\n    if (sibling.tagName === \"OL\")\n      return sibling\n\n    /* Skip empty paragraphs - see https://bit.ly/3r4ZJ2O */\n    else if (sibling.tagName === \"P\" && !sibling.children.length)\n      return findCandidateList(sibling)\n  }\n\n  /* Everything else */\n  return undefined\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch code block\n *\n * This function monitors size changes of the viewport, as well as switches of\n * content tabs with embedded code blocks, as both may trigger overflow.\n *\n * @param el - Code block element\n *\n * @returns Code block observable\n */\nexport function watchCodeBlock(\n  el: HTMLElement\n): Observable<Overflow> {\n  return watchElementSize(el)\n    .pipe(\n      map(({ width }) => {\n        const content = getElementContentSize(el)\n        return {\n          scrollable: content.width > width\n        }\n      }),\n      distinctUntilKeyChanged(\"scrollable\")\n    )\n}\n\n/**\n * Mount code block\n *\n * This function ensures that an overflowing code block is focusable through\n * keyboard, so it can be scrolled without a mouse to improve on accessibility.\n * Furthermore, if code annotations are enabled, they are mounted if and only\n * if the code block is currently visible, e.g., not in a hidden content tab.\n *\n * Note that code blocks may be mounted eagerly or lazily. 
If they're mounted\n * lazily (on first visibility), code annotation anchor links will not work,\n * as they are evaluated on initial page load, and code annotations in general\n * might feel a little bumpier.\n *\n * @param el - Code block element\n * @param options - Options\n *\n * @returns Code block and annotation component observable\n */\nexport function mountCodeBlock(\n  el: HTMLElement, options: MountOptions\n): Observable<Component<CodeBlock>> {\n  const { matches: hover } = matchMedia(\"(hover)\")\n\n  /* Defer mounting of code block - see https://bit.ly/3vHVoVD */\n  const factory$ = defer(() => {\n    const push$ = new Subject<Overflow>()\n    const done$ = push$.pipe(takeLast(1))\n    push$.subscribe(({ scrollable }) => {\n      if (scrollable && hover)\n        el.setAttribute(\"tabindex\", \"0\")\n      else\n        el.removeAttribute(\"tabindex\")\n    })\n\n    /* Render button for Clipboard.js integration */\n    const content$: Array<Observable<Component<CodeBlock>>> = []\n    if (ClipboardJS.isSupported()) {\n      if (el.closest(\".copy\") || (\n        feature(\"content.code.copy\") && !el.closest(\".no-copy\")\n      )) {\n        const parent = el.closest(\"pre\")!\n        parent.id = `__code_${sequence++}`\n\n        /* Mount tooltip, if enabled */\n        const button = renderClipboardButton(parent.id)\n        parent.insertBefore(button, el)\n        if (feature(\"content.tooltips\"))\n          content$.push(mountInlineTooltip2(button, { viewport$ }))\n      }\n    }\n\n    /* Handle code annotations */\n    const container = el.closest(\".highlight\")\n    if (container instanceof HTMLElement) {\n      const list = findCandidateList(container)\n\n      /* Mount code annotations, if enabled */\n      if (typeof list !== \"undefined\" && (\n        container.classList.contains(\"annotate\") ||\n        feature(\"content.code.annotate\")\n      )) {\n        const annotations$ = mountAnnotationList(list, el, options)\n        
content$.push(\n          watchElementSize(container)\n            .pipe(\n              takeUntil(done$),\n              map(({ width, height }) => width && height),\n              distinctUntilChanged(),\n              switchMap(active => active ? annotations$ : EMPTY)\n            )\n        )\n      }\n    }\n\n    // If the code block has line spans, we can add this additional class to\n    // the code block element, which fixes the problem for highlighted code\n    // lines not stretching to the entirety of the screen when the code block\n    // overflows, e.g., on mobile - see\n    const spans = getElements(\":scope > span[id]\", el)\n    if (spans.length)\n      el.classList.add(\"md-code__content\")\n\n    /* Create and return component */\n    return watchCodeBlock(el)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state })),\n        mergeWith(...content$)\n      )\n  })\n\n  /* Mount code block lazily */\n  if (feature(\"content.lazy\"))\n    return watchElementVisibility(el)\n      .pipe(\n        filter(visible => visible),\n        take(1),\n        switchMap(() => factory$)\n      )\n\n  /* Mount code block */\n  return factory$\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY 
OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  defer,\n  filter,\n  finalize,\n  map,\n  merge,\n  tap\n} from \"rxjs\"\n\nimport { Component } from \"../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Details\n */\nexport interface Details {\n  action: \"open\" | \"close\"             /* Details state */\n  reveal?: boolean                     /* Details is revealed */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  target$: Observable<HTMLElement>     /* Location target observable */\n  print$: Observable<boolean>          /* Media print observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  target$: Observable<HTMLElement>     /* Location target observable */\n  print$: Observable<boolean>          /* Media print observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch details\n *\n * @param el - Details element\n * @param options - Options\n *\n * @returns Details observable\n */\nexport function watchDetails(\n  el: HTMLDetailsElement, { target$, print$ }: WatchOptions\n): Observable<Details> {\n  let open = true\n  return 
merge(\n\n    /* Open and focus details on location target */\n    target$\n      .pipe(\n        map(target => target.closest(\"details:not([open])\")!),\n        filter(details => el === details),\n        map(() => ({\n          action: \"open\", reveal: true\n        }) as Details)\n      ),\n\n    /* Open details on print and close afterwards */\n    print$\n      .pipe(\n        filter(active => active || !open),\n        tap(() => open = el.open),\n        map(active => ({\n          action: active ? \"open\" : \"close\"\n        }) as Details)\n      )\n  )\n}\n\n/**\n * Mount details\n *\n * This function ensures that `details` tags are opened on anchor jumps and\n * prior to printing, so the whole content of the page is visible.\n *\n * @param el - Details element\n * @param options - Options\n *\n * @returns Details component observable\n */\nexport function mountDetails(\n  el: HTMLDetailsElement, options: MountOptions\n): Observable<Component<Details>> {\n  return defer(() => {\n    const push$ = new Subject<Details>()\n    push$.subscribe(({ action, reveal }) => {\n      el.toggleAttribute(\"open\", action === \"open\")\n      if (reveal)\n        el.scrollIntoView()\n    })\n\n    /* Create and return component */\n    return watchDetails(el, options)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", ".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel rect,.label div 
.edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel rect{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node .divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup 
.state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a .nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs #statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START 
circle{fill:var(--md-mermaid-label-bg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, 
and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  map,\n  of,\n  shareReplay,\n  tap\n} from \"rxjs\"\n\nimport { watchScript } from \"~/browser\"\nimport { h } from \"~/utilities\"\n\nimport { Component } from \"../../_\"\n\nimport themeCSS from \"./index.css\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mermaid diagram\n */\nexport interface Mermaid {}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Mermaid instance observable\n */\nlet mermaid$: Observable<void>\n\n/**\n * Global sequence number for diagrams\n */\nlet sequence = 0\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch Mermaid script\n *\n * @returns Mermaid scripts observable\n */\nfunction fetchScripts(): Observable<void> {\n  return typeof mermaid === \"undefined\" || mermaid instanceof Element\n    ? 
watchScript(\"https://unpkg.com/mermaid@10/dist/mermaid.min.js\")\n    : of(undefined)\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount Mermaid diagram\n *\n * @param el - Code block element\n *\n * @returns Mermaid diagram component observable\n */\nexport function mountMermaid(\n  el: HTMLElement\n): Observable<Component<Mermaid>> {\n  el.classList.remove(\"mermaid\") // Hack: mitigate https://bit.ly/3CiN6Du\n  mermaid$ ||= fetchScripts()\n    .pipe(\n      tap(() => mermaid.initialize({\n        startOnLoad: false,\n        themeCSS,\n        sequence: {\n          actorFontSize: \"16px\", // Hack: mitigate https://bit.ly/3y0NEi3\n          messageFontSize: \"16px\",\n          noteFontSize: \"16px\"\n        }\n      })),\n      map(() => undefined),\n      shareReplay(1)\n    )\n\n  /* Render diagram */\n  mermaid$.subscribe(async () => {\n    el.classList.add(\"mermaid\") // Hack: mitigate https://bit.ly/3CiN6Du\n    const id = `__mermaid_${sequence++}`\n\n    /* Create host element to replace code block */\n    const host = h(\"div\", { class: \"mermaid\" })\n    const text = el.textContent\n\n    /* Render and inject diagram */\n    const { svg, fn } = await mermaid.render(id, text)\n\n    /* Create a shadow root and inject diagram */\n    const shadow = host.attachShadow({ mode: \"closed\" })\n    shadow.innerHTML = svg\n\n    /* Replace code block with diagram and bind functions */\n    el.replaceWith(host)\n    fn?.(shadow)\n  })\n\n  /* Create and return component */\n  return mermaid$\n    .pipe(\n      map(() => ({ ref: el }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software 
without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { Observable, of } from \"rxjs\"\n\nimport { renderTable } from \"~/templates\"\nimport { h } from \"~/utilities\"\n\nimport { Component } from \"../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Data table\n */\nexport interface DataTable {}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Sentinel for replacement\n */\nconst sentinel = h(\"table\")\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount data table\n *\n * This function wraps a data table in another scrollable container, so it can\n * be smoothly scrolled on smaller screen sizes and won't break the layout.\n *\n * @param el - Data table element\n *\n * @returns Data table 
component observable\n */\nexport function mountDataTable(\n  el: HTMLElement\n): Observable<Component<DataTable>> {\n  el.replaceWith(sentinel)\n  sentinel.replaceWith(renderTable(el))\n\n  /* Create and return component */\n  return of({ ref: el })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  animationFrameScheduler,\n  asyncScheduler,\n  auditTime,\n  combineLatest,\n  defer,\n  endWith,\n  filter,\n  finalize,\n  fromEvent,\n  ignoreElements,\n  map,\n  merge,\n  skip,\n  startWith,\n  subscribeOn,\n  takeUntil,\n  tap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport {\n  Viewport,\n  getElement,\n  getElementContentOffset,\n  getElementContentSize,\n  getElementOffset,\n  getElementSize,\n  getElements,\n  watchElementContentOffset,\n  watchElementSize,\n  watchElementVisibility\n} from \"~/browser\"\nimport { renderTabbedControl } from \"~/templates\"\nimport { h } from \"~/utilities\"\n\nimport { Component } from \"../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Content tabs\n */\nexport interface ContentTabs {\n  active: HTMLLabelElement             /* Active tab label */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  target$: Observable<HTMLElement>     /* Location target observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch content tabs\n *\n * @param inputs - Content tabs input elements\n *\n * @returns Content tabs observable\n */\nexport 
function watchContentTabs(\n  inputs: HTMLInputElement[]\n): Observable<ContentTabs> {\n  const initial = inputs.find(input => input.checked) || inputs[0]\n  return merge(...inputs.map(input => fromEvent(input, \"change\")\n    .pipe(\n      map(() => getElement<HTMLLabelElement>(`label[for=\"${input.id}\"]`))\n    )\n  ))\n    .pipe(\n      startWith(getElement<HTMLLabelElement>(`label[for=\"${initial.id}\"]`)),\n      map(active => ({ active }))\n    )\n}\n\n/**\n * Mount content tabs\n *\n * @param el - Content tabs element\n * @param options - Options\n *\n * @returns Content tabs component observable\n */\nexport function mountContentTabs(\n  el: HTMLElement, { viewport$, target$ }: MountOptions\n): Observable<Component<ContentTabs>> {\n  const container = getElement(\".tabbed-labels\", el)\n  const inputs = getElements<HTMLInputElement>(\":scope > input\", el)\n\n  /* Render content tab previous button for pagination */\n  const prev = renderTabbedControl(\"prev\")\n  el.append(prev)\n\n  /* Render content tab next button for pagination */\n  const next = renderTabbedControl(\"next\")\n  el.append(next)\n\n  /* Mount component on subscription */\n  return defer(() => {\n    const push$ = new Subject<ContentTabs>()\n    const done$ = push$.pipe(ignoreElements(), endWith(true))\n    combineLatest([push$, watchElementSize(el), watchElementVisibility(el)])\n      .pipe(\n        takeUntil(done$),\n        auditTime(1, animationFrameScheduler)\n      )\n        .subscribe({\n\n          /* Handle emission */\n          next([{ active }, size]) {\n            const offset = getElementOffset(active)\n            const { width } = getElementSize(active)\n\n            /* Set tab indicator offset and width */\n            el.style.setProperty(\"--md-indicator-x\", `${offset.x}px`)\n            el.style.setProperty(\"--md-indicator-width\", `${width}px`)\n\n            /* Scroll container to active content tab */\n            const content = 
getElementContentOffset(container)\n            if (\n              offset.x         < content.x              ||\n              offset.x + width > content.x + size.width\n            )\n              container.scrollTo({\n                left: Math.max(0, offset.x - 16),\n                behavior: \"smooth\"\n              })\n          },\n\n          /* Handle complete */\n          complete() {\n            el.style.removeProperty(\"--md-indicator-x\")\n            el.style.removeProperty(\"--md-indicator-width\")\n          }\n        })\n\n    /* Hide content tab buttons on borders */\n    combineLatest([\n      watchElementContentOffset(container),\n      watchElementSize(container)\n    ])\n      .pipe(\n        takeUntil(done$)\n      )\n        .subscribe(([offset, size]) => {\n          const content = getElementContentSize(container)\n          prev.hidden = offset.x < 16\n          next.hidden = offset.x > content.width - size.width - 16\n        })\n\n    /* Paginate content tab container on click */\n    merge(\n      fromEvent(prev, \"click\").pipe(map(() => -1)),\n      fromEvent(next, \"click\").pipe(map(() => +1))\n    )\n      .pipe(\n        takeUntil(done$)\n      )\n        .subscribe(direction => {\n          const { width } = getElementSize(container)\n          container.scrollBy({\n            left: width * direction,\n            behavior: \"smooth\"\n          })\n        })\n\n    /* Switch to content tab target */\n    target$\n      .pipe(\n        takeUntil(done$),\n        filter(input => inputs.includes(input as HTMLInputElement))\n      )\n        .subscribe(input => input.click())\n\n    /* Add link to each content tab label */\n    container.classList.add(\"tabbed-labels--linked\")\n    for (const input of inputs) {\n      const label = getElement<HTMLLabelElement>(`label[for=\"${input.id}\"]`)\n      label.replaceChildren(h(\"a\", {\n        href: `#${label.htmlFor}`,\n        tabIndex: -1\n      }, 
...Array.from(label.childNodes)))\n\n      /* Allow to copy link without scrolling to anchor */\n      fromEvent<MouseEvent>(label.firstElementChild!, \"click\")\n        .pipe(\n          takeUntil(done$),\n          filter(ev => !(ev.metaKey || ev.ctrlKey)),\n          tap(ev => {\n            ev.preventDefault()\n            ev.stopPropagation()\n          })\n        )\n          // @todo we might need to remove the anchor link on complete\n          .subscribe(() => {\n            history.replaceState({}, \"\", `#${label.htmlFor}`)\n            label.click()\n          })\n    }\n\n    /* Set up linking of content tabs, if enabled */\n    if (feature(\"content.tabs.link\"))\n      push$.pipe(\n        skip(1),\n        withLatestFrom(viewport$)\n      )\n        .subscribe(([{ active }, { offset }]) => {\n          const tab = active.innerText.trim()\n          if (active.hasAttribute(\"data-md-switching\")) {\n            active.removeAttribute(\"data-md-switching\")\n\n          /* Determine viewport offset of active tab */\n          } else {\n            const y = el.offsetTop - offset.y\n\n            /* Passively activate other tabs */\n            for (const set of getElements(\"[data-tabs]\"))\n              for (const input of getElements<HTMLInputElement>(\n                \":scope > input\", set\n              )) {\n                const label = getElement(`label[for=\"${input.id}\"]`)\n                if (\n                  label !== active &&\n                  label.innerText.trim() === tab\n                ) {\n                  label.setAttribute(\"data-md-switching\", \"\")\n                  input.click()\n                  break\n                }\n              }\n\n            /* Bring active tab into view */\n            window.scrollTo({\n              top: el.offsetTop - y\n            })\n\n            /* Persist active tabs in local storage */\n            const tabs = __md_get<string[]>(\"__tabs\") || []\n            
__md_set(\"__tabs\", [...new Set([tab, ...tabs])])\n          }\n        })\n\n    /* Pause media (audio, video) on switch - see https://bit.ly/3Bk6cel */\n    push$.pipe(takeUntil(done$))\n      .subscribe(() => {\n        for (const media of getElements<HTMLAudioElement>(\"audio, video\", el))\n          media.pause()\n      })\n\n    /* Create and return component */\n    return watchContentTabs(inputs)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n    .pipe(\n      subscribeOn(asyncScheduler)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { Observable, merge } from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport { Viewport, getElements } from \"~/browser\"\n\nimport { Component } from \"../../_\"\nimport {\n  Tooltip,\n  mountInlineTooltip2\n} from \"../../tooltip2\"\nimport {\n  Annotation,\n  mountAnnotationBlock\n} from \"../annotation\"\nimport {\n  CodeBlock,\n  mountCodeBlock\n} from \"../code\"\nimport {\n  Details,\n  mountDetails\n} from \"../details\"\nimport {\n  Mermaid,\n  mountMermaid\n} from \"../mermaid\"\nimport {\n  DataTable,\n  mountDataTable\n} from \"../table\"\nimport {\n  ContentTabs,\n  mountContentTabs\n} from \"../tabs\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Content\n */\nexport type Content =\n  | Annotation\n  | CodeBlock\n  | ContentTabs\n  | DataTable\n  | Details\n  | Mermaid\n  | Tooltip\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  target$: Observable<HTMLElement>     /* Location target observable */\n  print$: Observable<boolean>          /* Media print observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount content\n *\n * This function mounts all components that are found in the content of the\n * actual article, including 
code blocks, data tables and details.\n *\n * @param el - Content element\n * @param options - Options\n *\n * @returns Content component observable\n */\nexport function mountContent(\n  el: HTMLElement, { viewport$, target$, print$ }: MountOptions\n): Observable<Component<Content>> {\n  return merge(\n\n    /* Annotations */\n    ...getElements(\".annotate:not(.highlight)\", el)\n      .map(child => mountAnnotationBlock(child, { target$, print$ })),\n\n    /* Code blocks */\n    ...getElements(\"pre:not(.mermaid) > code\", el)\n      .map(child => mountCodeBlock(child, { target$, print$ })),\n\n    /* Mermaid diagrams */\n    ...getElements(\"pre.mermaid\", el)\n      .map(child => mountMermaid(child)),\n\n    /* Data tables */\n    ...getElements(\"table:not([class])\", el)\n      .map(child => mountDataTable(child)),\n\n    /* Details */\n    ...getElements(\"details\", el)\n      .map(child => mountDetails(child, { target$, print$ })),\n\n    /* Content tabs */\n    ...getElements(\"[data-tabs]\", el)\n      .map(child => mountContentTabs(child, { viewport$, target$ })),\n\n    /* Tooltips */\n    ...getElements(\"[title]\", el)\n      .filter(() => feature(\"content.tooltips\"))\n      .map(child => mountInlineTooltip2(child, { viewport$ }))\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS 
IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  defer,\n  delay,\n  finalize,\n  map,\n  merge,\n  of,\n  switchMap,\n  tap\n} from \"rxjs\"\n\nimport { getElement } from \"~/browser\"\n\nimport { Component } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Dialog\n */\nexport interface Dialog {\n  message: string                      /* Dialog message */\n  active: boolean                      /* Dialog is active */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  alert$: Subject<string>              /* Alert subject */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  alert$: Subject<string>              /* Alert subject */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch dialog\n *\n * @param _el - Dialog element\n * @param options - Options\n *\n * @returns Dialog observable\n */\nexport function watchDialog(\n  _el: HTMLElement, { alert$ }: WatchOptions\n): Observable<Dialog> {\n  return alert$\n    .pipe(\n      switchMap(message => merge(\n        of(true),\n        of(false).pipe(delay(2000))\n      )\n        
.pipe(\n          map(active => ({ message, active }))\n        )\n      )\n    )\n}\n\n/**\n * Mount dialog\n *\n * This function reveals the dialog in the right corner when a new alert is\n * emitted through the subject that is passed as part of the options.\n *\n * @param el - Dialog element\n * @param options - Options\n *\n * @returns Dialog component observable\n */\nexport function mountDialog(\n  el: HTMLElement, options: MountOptions\n): Observable<Component<Dialog>> {\n  const inner = getElement(\".md-typeset\", el)\n  return defer(() => {\n    const push$ = new Subject<Dialog>()\n    push$.subscribe(({ message, active }) => {\n      el.classList.toggle(\"md-dialog--active\", active)\n      inner.textContent = message\n    })\n\n    /* Create and return component */\n    return watchDialog(el, options)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  animationFrameScheduler,\n  asyncScheduler,\n  auditTime,\n  combineLatest,\n  debounceTime,\n  defer,\n  distinctUntilChanged,\n  filter,\n  finalize,\n  map,\n  merge,\n  of,\n  subscribeOn,\n  tap,\n  throttleTime\n} from \"rxjs\"\n\nimport {\n  ElementOffset,\n  getElement,\n  getElementContainer,\n  getElementOffset,\n  getElementSize,\n  watchElementContentOffset,\n  watchElementFocus,\n  watchElementHover\n} from \"~/browser\"\nimport { renderTooltip } from \"~/templates\"\n\nimport { Component } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Tooltip\n */\nexport interface Tooltip {\n  active: boolean                      /* Tooltip is active */\n  offset: ElementOffset                /* Tooltip offset */\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Global sequence number for tooltips\n */\nlet sequence = 0\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch tooltip\n *\n * This function will append the tooltip temporarily to compute its width,\n * which is necessary for correct centering, and then removing it again.\n *\n * @param el - Tooltip element\n * @param host - Host element\n *\n * @returns Tooltip observable\n */\nexport function watchTooltip(\n  el: HTMLElement, host: HTMLElement\n): 
Observable<Tooltip> {\n  document.body.append(el)\n\n  /* Compute width and remove tooltip immediately */\n  const { width } = getElementSize(el)\n  el.style.setProperty(\"--md-tooltip-width\", `${width}px`)\n  el.remove()\n\n  /* Retrieve and watch containing element */\n  const container = getElementContainer(host)\n  const scroll$ =\n    typeof container !== \"undefined\"\n      ? watchElementContentOffset(container)\n      : of({ x: 0, y: 0 })\n\n  /* Compute tooltip visibility */\n  const active$ = merge(\n    watchElementFocus(host),\n    watchElementHover(host)\n  )\n    .pipe(\n      distinctUntilChanged()\n    )\n\n  /* Compute tooltip offset */\n  return combineLatest([active$, scroll$])\n    .pipe(\n      map(([active, scroll]) => {\n        let { x, y } = getElementOffset(host)\n        const size = getElementSize(host)\n\n        /**\n         * Experimental: fix handling of tables - see https://bit.ly/3TQEj5O\n         *\n         * If this proves to be a viable fix, we should refactor tooltip\n         * positioning and somehow streamline the current process. 
This might\n         * also fix positioning for annotations inside tables, which is another\n         * limitation.\n         */\n        const table = host.closest(\"table\")\n        if (table && host.parentElement) {\n          x += table.offsetLeft + host.parentElement.offsetLeft\n          y += table.offsetTop  + host.parentElement.offsetTop\n        }\n        return {\n          active,\n          offset: {\n            x: x - scroll.x + size.width  / 2 - width / 2,\n            y: y - scroll.y + size.height + 8\n          }\n        }\n      })\n    )\n}\n\n/**\n * Mount tooltip\n *\n * @param el - Host element\n *\n * @returns Tooltip component observable\n */\nexport function mountTooltip(\n  el: HTMLElement\n): Observable<Component<Tooltip>> {\n  const title = el.title\n  if (!title.length)\n    return EMPTY\n\n  /* Render tooltip and set title from host element */\n  const id = `__tooltip_${sequence++}`\n  const tooltip = renderTooltip(id, \"inline\")\n  const typeset = getElement(\".md-typeset\", tooltip)\n  typeset.innerHTML = title\n\n  /* Mount component on subscription */\n  return defer(() => {\n    const push$ = new Subject<Tooltip>()\n    push$.subscribe({\n\n      /* Handle emission */\n      next({ offset }) {\n        tooltip.style.setProperty(\"--md-tooltip-x\", `${offset.x}px`)\n        tooltip.style.setProperty(\"--md-tooltip-y\", `${offset.y}px`)\n      },\n\n      /* Handle complete */\n      complete() {\n        tooltip.style.removeProperty(\"--md-tooltip-x\")\n        tooltip.style.removeProperty(\"--md-tooltip-y\")\n      }\n    })\n\n    /* Toggle tooltip presence to mitigate empty lines when copying */\n    merge(\n      push$.pipe(filter(({ active }) => active)),\n      push$.pipe(debounceTime(250), filter(({ active }) => !active))\n    )\n      .subscribe({\n\n        /* Handle emission */\n        next({ active }) {\n          if (active) {\n            el.insertAdjacentElement(\"afterend\", tooltip)\n            
el.setAttribute(\"aria-describedby\", id)\n            el.removeAttribute(\"title\")\n          } else {\n            tooltip.remove()\n            el.removeAttribute(\"aria-describedby\")\n            el.setAttribute(\"title\", title)\n          }\n        },\n\n        /* Handle complete */\n        complete() {\n          tooltip.remove()\n          el.removeAttribute(\"aria-describedby\")\n          el.setAttribute(\"title\", title)\n        }\n      })\n\n    /* Toggle tooltip visibility */\n    push$\n      .pipe(\n        auditTime(16, animationFrameScheduler)\n      )\n        .subscribe(({ active }) => {\n          tooltip.classList.toggle(\"md-tooltip--active\", active)\n        })\n\n    // @todo - refactor positioning together with annotations \u2013 there are\n    // several things that overlap and are identical in handling\n\n    /* Track relative origin of tooltip */\n    push$\n      .pipe(\n        throttleTime(125, animationFrameScheduler),\n        filter(() => !!el.offsetParent),\n        map(() => el.offsetParent!.getBoundingClientRect()),\n        map(({ x }) => x)\n      )\n      .subscribe({\n\n        /* Handle emission */\n        next(origin) {\n          if (origin)\n            tooltip.style.setProperty(\"--md-tooltip-0\", `${-origin}px`)\n          else\n            tooltip.style.removeProperty(\"--md-tooltip-0\")\n        },\n\n        /* Handle complete */\n        complete() {\n          tooltip.style.removeProperty(\"--md-tooltip-0\")\n        }\n      })\n\n    /* Create and return component */\n    return watchTooltip(tooltip, el)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n    .pipe(\n      subscribeOn(asyncScheduler)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software 
and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  bufferCount,\n  combineLatest,\n  combineLatestWith,\n  defer,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  endWith,\n  filter,\n  from,\n  ignoreElements,\n  map,\n  mergeMap,\n  mergeWith,\n  of,\n  shareReplay,\n  startWith,\n  switchMap,\n  takeUntil\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport {\n  Viewport,\n  getElements,\n  watchElementSize,\n  watchToggle\n} from \"~/browser\"\n\nimport { Component } from \"../../_\"\nimport { Main } from \"../../main\"\nimport {\n  Tooltip,\n  mountTooltip\n} from \"../../tooltip\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Header\n */\nexport interface Header {\n  height: number                       /* Header visible height */\n  hidden: boolean                      /* Header is hidden */\n}\n\n/* 
----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n  main$: Observable<Main>              /* Main area observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Compute whether the header is hidden\n *\n * If the user scrolls past a certain threshold, the header can be hidden when\n * scrolling down, and shown when scrolling up.\n *\n * @param options - Options\n *\n * @returns Toggle observable\n */\nfunction isHidden({ viewport$ }: WatchOptions): Observable<boolean> {\n  if (!feature(\"header.autohide\"))\n    return of(false)\n\n  /* Compute direction and turning point */\n  const direction$ = viewport$\n    .pipe(\n      map(({ offset: { y } }) => y),\n      bufferCount(2, 1),\n      map(([a, b]) => [a < b, b] as const),\n      distinctUntilKeyChanged(0)\n    )\n\n  /* Compute whether header should be hidden */\n  const hidden$ = combineLatest([viewport$, direction$])\n    .pipe(\n      filter(([{ offset }, [, y]]) => Math.abs(y - offset.y) > 100),\n      map(([, [direction]]) => direction),\n      distinctUntilChanged()\n    )\n\n  /* Compute threshold for hiding */\n  const search$ = watchToggle(\"search\")\n  return combineLatest([viewport$, search$])\n    .pipe(\n      map(([{ offset }, search]) => offset.y > 400 && !search),\n      distinctUntilChanged(),\n      switchMap(active => active ? 
hidden$ : of(false)),\n      startWith(false)\n    )\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch header\n *\n * @param el - Header element\n * @param options - Options\n *\n * @returns Header observable\n */\nexport function watchHeader(\n  el: HTMLElement, options: WatchOptions\n): Observable<Header> {\n  return defer(() => combineLatest([\n    watchElementSize(el),\n    isHidden(options)\n  ]))\n    .pipe(\n      map(([{ height }, hidden]) => ({\n        height,\n        hidden\n      })),\n      distinctUntilChanged((a, b) => (\n        a.height === b.height &&\n        a.hidden === b.hidden\n      )),\n      shareReplay(1)\n    )\n}\n\n/**\n * Mount header\n *\n * This function manages the different states of the header, i.e. whether it's\n * hidden or rendered with a shadow. This depends heavily on the main area.\n *\n * @param el - Header element\n * @param options - Options\n *\n * @returns Header component observable\n */\nexport function mountHeader(\n  el: HTMLElement, { header$, main$ }: MountOptions\n): Observable<Component<Header | Tooltip>> {\n  return defer(() => {\n    const push$ = new Subject<Main>()\n    const done$ = push$.pipe(ignoreElements(), endWith(true))\n    push$\n      .pipe(\n        distinctUntilKeyChanged(\"active\"),\n        combineLatestWith(header$)\n      )\n        .subscribe(([{ active }, { hidden }]) => {\n          el.classList.toggle(\"md-header--shadow\", active && !hidden)\n          el.hidden = hidden\n        })\n\n    /* Mount tooltips, if enabled */\n    const tooltips = from(getElements(\"[title]\", el))\n      .pipe(\n        filter(() => feature(\"content.tooltips\")),\n        mergeMap(child => mountTooltip(child))\n      )\n\n    /* Link to main area */\n    main$.subscribe(push$)\n\n    /* Create and return component */\n    return header$\n      .pipe(\n       
 takeUntil(done$),\n        map(state => ({ ref: el, ...state })),\n        mergeWith(tooltips.pipe(takeUntil(done$)))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  defer,\n  distinctUntilKeyChanged,\n  finalize,\n  map,\n  tap\n} from \"rxjs\"\n\nimport {\n  Viewport,\n  getElementSize,\n  getOptionalElement,\n  watchViewportAt\n} from \"~/browser\"\n\nimport { Component } from \"../../_\"\nimport { Header } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Header\n */\nexport interface HeaderTitle {\n  active: boolean                      /* Header title is active */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch header title\n *\n * @param el - Heading element\n * @param options - Options\n *\n * @returns Header title observable\n */\nexport function watchHeaderTitle(\n  el: HTMLElement, { viewport$, header$ }: WatchOptions\n): Observable<HeaderTitle> {\n  return watchViewportAt(el, { viewport$, header$ })\n    .pipe(\n      map(({ offset: { y } }) => {\n  
      const { height } = getElementSize(el)\n        return {\n          active: y >= height\n        }\n      }),\n      distinctUntilKeyChanged(\"active\")\n    )\n}\n\n/**\n * Mount header title\n *\n * This function swaps the header title from the site title to the title of the\n * current page when the user scrolls past the first headline.\n *\n * @param el - Header title element\n * @param options - Options\n *\n * @returns Header title component observable\n */\nexport function mountHeaderTitle(\n  el: HTMLElement, options: MountOptions\n): Observable<Component<HeaderTitle>> {\n  return defer(() => {\n    const push$ = new Subject<HeaderTitle>()\n    push$.subscribe({\n\n      /* Handle emission */\n      next({ active }) {\n        el.classList.toggle(\"md-header__title--active\", active)\n      },\n\n      /* Handle complete */\n      complete() {\n        el.classList.remove(\"md-header__title--active\")\n      }\n    })\n\n    /* Obtain headline, if any */\n    const heading = getOptionalElement(\".md-content h1\")\n    if (typeof heading === \"undefined\")\n      return EMPTY\n\n    /* Create and return component */\n    return watchHeaderTitle(heading, options)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies 
or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  combineLatest,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  map,\n  switchMap\n} from \"rxjs\"\n\nimport {\n  Viewport,\n  watchElementSize\n} from \"~/browser\"\n\nimport { Header } from \"../header\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Main area\n */\nexport interface Main {\n  offset: number                       /* Main area top offset */\n  height: number                       /* Main area visible height */\n  active: boolean                      /* Main area is active */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch main area\n *\n * This function returns an observable that computes the visual parameters of\n * the main area which depends on the viewport vertical offset and height, as\n * well as the height of the header element, 
if the header is fixed.\n *\n * @param el - Main area element\n * @param options - Options\n *\n * @returns Main area observable\n */\nexport function watchMain(\n  el: HTMLElement, { viewport$, header$ }: WatchOptions\n): Observable<Main> {\n\n  /* Compute necessary adjustment for header */\n  const adjust$ = header$\n    .pipe(\n      map(({ height }) => height),\n      distinctUntilChanged()\n    )\n\n  /* Compute the main area's top and bottom borders */\n  const border$ = adjust$\n    .pipe(\n      switchMap(() => watchElementSize(el)\n        .pipe(\n          map(({ height }) => ({\n            top:    el.offsetTop,\n            bottom: el.offsetTop + height\n          })),\n          distinctUntilKeyChanged(\"bottom\")\n        )\n      )\n    )\n\n  /* Compute the main area's offset, visible height and if we scrolled past */\n  return combineLatest([adjust$, border$, viewport$])\n    .pipe(\n      map(([header, { top, bottom }, { offset: { y }, size: { height } }]) => {\n        height = Math.max(0, height\n          - Math.max(0, top    - y,  header)\n          - Math.max(0, height + y - bottom)\n        )\n        return {\n          offset: top - header,\n          height,\n          active: top - header <= y\n        }\n      }),\n      distinctUntilChanged((a, b) => (\n        a.offset === b.offset &&\n        a.height === b.height &&\n        a.active === b.active\n      ))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice 
and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  asyncScheduler,\n  defer,\n  filter,\n  finalize,\n  fromEvent,\n  map,\n  mergeMap,\n  observeOn,\n  of,\n  repeat,\n  shareReplay,\n  skip,\n  startWith,\n  takeUntil,\n  tap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport { getElements, watchMedia } from \"~/browser\"\nimport { h } from \"~/utilities\"\n\nimport {\n  Component,\n  getComponentElement\n} from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Palette colors\n */\nexport interface PaletteColor {\n  media?: string                       /* Media query */\n  scheme?: string                      /* Color scheme */\n  primary?: string                     /* Primary color */\n  accent?: string                      /* Accent color */\n}\n\n/**\n * Palette\n */\nexport interface Palette {\n  index: number                        /* Palette index */\n  color: PaletteColor                  /* Palette colors */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch color palette\n *\n * @param inputs - Color palette element\n *\n * @returns Color palette observable\n */\nexport function 
watchPalette(\n  inputs: HTMLInputElement[]\n): Observable<Palette> {\n  const current = __md_get<Palette>(\"__palette\") || {\n    index: inputs.findIndex(input => matchMedia(\n      input.getAttribute(\"data-md-color-media\")!\n    ).matches)\n  }\n\n  /* Emit changes in color palette */\n  const index = Math.max(0, Math.min(current.index, inputs.length - 1))\n  return of(...inputs)\n    .pipe(\n      mergeMap(input => fromEvent(input, \"change\").pipe(map(() => input))),\n      startWith(inputs[index]),\n      map(input => ({\n        index: inputs.indexOf(input),\n        color: {\n          media:   input.getAttribute(\"data-md-color-media\"),\n          scheme:  input.getAttribute(\"data-md-color-scheme\"),\n          primary: input.getAttribute(\"data-md-color-primary\"),\n          accent:  input.getAttribute(\"data-md-color-accent\")\n        }\n      } as Palette)),\n      shareReplay(1)\n    )\n}\n\n/**\n * Mount color palette\n *\n * @param el - Color palette element\n *\n * @returns Color palette component observable\n */\nexport function mountPalette(\n  el: HTMLElement\n): Observable<Component<Palette>> {\n  const inputs = getElements<HTMLInputElement>(\"input\", el)\n  const meta = h(\"meta\", { name: \"theme-color\" })\n  document.head.appendChild(meta)\n\n  // Add color scheme meta tag\n  const scheme = h(\"meta\", { name: \"color-scheme\" })\n  document.head.appendChild(scheme)\n\n  /* Mount component on subscription */\n  const media$ = watchMedia(\"(prefers-color-scheme: light)\")\n  return defer(() => {\n    const push$ = new Subject<Palette>()\n    push$.subscribe(palette => {\n      document.body.setAttribute(\"data-md-color-switching\", \"\")\n\n      /* Retrieve color palette for system preference */\n      if (palette.color.media === \"(prefers-color-scheme)\") {\n        const media = matchMedia(\"(prefers-color-scheme: light)\")\n        const input = document.querySelector(media.matches\n          ? 
\"[data-md-color-media='(prefers-color-scheme: light)']\"\n          : \"[data-md-color-media='(prefers-color-scheme: dark)']\"\n        )!\n\n        /* Retrieve colors for system preference */\n        palette.color.scheme  = input.getAttribute(\"data-md-color-scheme\")!\n        palette.color.primary = input.getAttribute(\"data-md-color-primary\")!\n        palette.color.accent  = input.getAttribute(\"data-md-color-accent\")!\n      }\n\n      /* Set color palette */\n      for (const [key, value] of Object.entries(palette.color))\n        document.body.setAttribute(`data-md-color-${key}`, value)\n\n      /* Set toggle visibility */\n      for (let index = 0; index < inputs.length; index++) {\n        const label = inputs[index].nextElementSibling\n        if (label instanceof HTMLElement)\n          label.hidden = palette.index !== index\n      }\n\n      /* Persist preference in local storage */\n      __md_set(\"__palette\", palette)\n    })\n\n    // Handle color switch on Enter or Space - see https://t.ly/YIhVj\n    fromEvent<KeyboardEvent>(el, \"keydown\").pipe(\n      filter(ev => ev.key === \"Enter\"),\n      withLatestFrom(push$, (_, palette) => palette)\n    )\n      .subscribe(({ index }) => {\n        index = (index + 1) % inputs.length\n        inputs[index].click()\n        inputs[index].focus()\n      })\n\n    /* Update theme-color meta tag */\n    push$\n      .pipe(\n        map(() => {\n          const header = getComponentElement(\"header\")\n          const style  = window.getComputedStyle(header)\n\n          // Set color scheme\n          scheme.content = style.colorScheme\n\n          /* Return color in hexadecimal format */\n          return style.backgroundColor.match(/\\d+/g)!\n            .map(value => (+value).toString(16).padStart(2, \"0\"))\n            .join(\"\")\n        })\n      )\n        .subscribe(color => meta.content = `#${color}`)\n\n    /* Revert transition durations after color switch */\n    
push$.pipe(observeOn(asyncScheduler))\n      .subscribe(() => {\n        document.body.removeAttribute(\"data-md-color-switching\")\n      })\n\n    /* Create and return component */\n    return watchPalette(inputs)\n      .pipe(\n        takeUntil(media$.pipe(skip(1))),\n        repeat(),\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  defer,\n  finalize,\n  map,\n  tap\n} from \"rxjs\"\n\nimport { Component } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Progress indicator\n */\nexport interface Progress {\n  value: number                        // Progress value\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  progress$: Subject<number>           // Progress subject\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount progress indicator\n *\n * @param el - Progress indicator element\n * @param options - Options\n *\n * @returns Progress indicator component observable\n */\nexport function mountProgress(\n  el: HTMLElement, { progress$ }: MountOptions\n): Observable<Component<Progress>> {\n\n  // Mount component on subscription\n  return defer(() => {\n    const push$ = new Subject<Progress>()\n    push$.subscribe(({ value }) => {\n      el.style.setProperty(\"--md-progress-value\", `${value}`)\n    })\n\n    // Create and return component\n    return progress$\n      .pipe(\n        tap(value => push$.next({ value })),\n        finalize(() => push$.complete()),\n        map(value => ({ ref: el, value }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath 
<martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport ClipboardJS from \"clipboard\"\nimport {\n  Observable,\n  Subject,\n  map,\n  tap\n} from \"rxjs\"\n\nimport { translation } from \"~/_\"\nimport { getElement } from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Setup options\n */\ninterface SetupOptions {\n  alert$: Subject<string>              /* Alert subject */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Extract text to copy\n *\n * @param el - HTML element\n *\n * @returns Extracted text\n */\nfunction extract(el: HTMLElement): string {\n  el.setAttribute(\"data-md-copying\", 
\"\")\n  const copy = el.closest(\"[data-copy]\")\n  const text = copy\n    ? copy.getAttribute(\"data-copy\")!\n    : el.innerText\n  el.removeAttribute(\"data-md-copying\")\n  return text.trimEnd()\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Set up Clipboard.js integration\n *\n * @param options - Options\n */\nexport function setupClipboardJS(\n  { alert$ }: SetupOptions\n): void {\n  if (ClipboardJS.isSupported()) {\n    new Observable<ClipboardJS.Event>(subscriber => {\n      new ClipboardJS(\"[data-clipboard-target], [data-clipboard-text]\", {\n        text: el => (\n          el.getAttribute(\"data-clipboard-text\")! ||\n          extract(getElement(\n            el.getAttribute(\"data-clipboard-target\")!\n          ))\n        )\n      })\n        .on(\"success\", ev => subscriber.next(ev))\n    })\n      .pipe(\n        tap(ev => {\n          const trigger = ev.trigger as HTMLElement\n          trigger.focus()\n        }),\n        map(() => translation(\"clipboard.copied\"))\n      )\n        .subscribe(alert$)\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED 
TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  catchError,\n  map,\n  of\n} from \"rxjs\"\n\nimport {\n  getElement,\n  getElements,\n  requestXML\n} from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Sitemap, i.e. a list of URLs\n */\nexport type Sitemap = Map<string, URL[]>\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Resolve URL to the given base URL\n *\n * When serving the site with instant navigation, MkDocs will set the hostname\n * to the value as specified in `dev_addr`, but the browser allows for several\n * hostnames to be used: `localhost`, `127.0.0.1` or even `0.0.0.0`, depending\n * on configuration. This function resolves the URL to the given hostname.\n *\n * @param url - URL\n * @param base - Base URL\n *\n * @returns Resolved URL\n */\nfunction resolve(url: URL, base: URL) {\n  url.protocol = base.protocol\n  url.hostname = base.hostname\n  return url\n}\n\n/**\n * Extract sitemap from document\n *\n * This function extracts the URLs and alternate links from the document, and\n * associates alternate links to the original URL as found in `loc`, allowing\n * the browser to navigate to the correct page when switching languages. 
The\n * format of the sitemap is expected to adhere to:\n *\n * ``` xml\n * <urlset>\n *   <url>\n *     <loc>...</loc>\n *     <xhtml:link rel=\"alternate\" hreflang=\"en\" href=\"...\"/>\n *     <xhtml:link rel=\"alternate\" hreflang=\"de\" href=\"...\"/>\n *     ...\n *   </url>\n *   ...\n * </urlset>\n * ```\n *\n * @param document - Document\n * @param base - Base URL\n *\n * @returns Sitemap\n */\nfunction extract(document: Document, base: URL): Sitemap {\n  const sitemap: Sitemap = new Map()\n  for (const el of getElements(\"url\", document)) {\n    const url = getElement(\"loc\", el)\n\n    // Create entry for location and add it to the list of links\n    const links = [resolve(new URL(url.textContent!), base)]\n    sitemap.set(`${links[0]}`, links)\n\n    // Attach alternate links to current entry\n    for (const link of getElements(\"[rel=alternate]\", el)) {\n      const href = link.getAttribute(\"href\")\n      if (href != null)\n        links.push(resolve(new URL(href), base))\n    }\n  }\n\n  // Return sitemap\n  return sitemap\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch the sitemap for the given base URL\n *\n * If a network or parsing error occurs, we just default to an empty sitemap,\n * which means the caller should fall back to regular navigation.\n *\n * @param base - Base URL\n *\n * @returns Sitemap observable\n */\nexport function fetchSitemap(base: URL | string): Observable<Sitemap> {\n  return requestXML(new URL(\"sitemap.xml\", base))\n    .pipe(\n      map(document => extract(document, new URL(base))),\n      catchError(() => of(new Map())),\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the 
\"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  catchError,\n  combineLatestWith,\n  concat,\n  debounceTime,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  endWith,\n  fromEvent,\n  ignoreElements,\n  map,\n  merge,\n  of,\n  share,\n  switchMap,\n  tap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport { configuration, feature } from \"~/_\"\nimport {\n  Viewport,\n  getElements,\n  getLocation,\n  getOptionalElement,\n  requestHTML,\n  setLocation,\n  setLocationHash\n} from \"~/browser\"\nimport { getComponentElement } from \"~/components\"\n\nimport { Sitemap, fetchSitemap } from \"../sitemap\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Setup options\n */\ninterface SetupOptions {\n  location$: Subject<URL>              // Location subject\n  viewport$: Observable<Viewport>      // Viewport observable\n  progress$: Subject<number>           // Progress 
subject\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Handle clicks on internal URLs while skipping external URLs\n *\n * @param ev - Mouse event\n * @param sitemap - Sitemap\n *\n * @returns URL observable\n */\nfunction handle(\n  ev: MouseEvent, sitemap: Sitemap\n): Observable<URL> {\n  if (!(ev.target instanceof Element))\n    return EMPTY\n\n  // Skip, as target is not within a link - clicks on non-link elements are\n  // also captured, which we need to exclude from processing\n  const el = ev.target.closest(\"a\")\n  if (el === null)\n    return EMPTY\n\n  // Skip, as link opens in new window - we now know we have captured a click\n  // on a link, but the link either has a `target` property defined, or the\n  // user pressed the `meta` or `ctrl` key to open it in a new window. Thus,\n  // we need to filter this event as well.\n  if (el.target || ev.metaKey || ev.ctrlKey)\n    return EMPTY\n\n  // Next, we must check if the URL is relevant for us, i.e., if it's an\n  // internal link to a page that is managed by MkDocs. Only then we can be\n  // sure that the structure of the page to be loaded adheres to the current\n  // document structure and can subsequently be injected into it without doing\n  // a full reload. For this reason, we must canonicalize the URL by removing\n  // all search parameters and hash fragments.\n  const url = new URL(el.href)\n  url.search = url.hash = \"\"\n\n  // Skip, if URL is not included in the sitemap - this could be the case when\n  // linking between versions or languages, or to another page that the author\n  // included as part of the build, but that is not managed by MkDocs. 
In that\n  // case we must not continue with instant navigation.\n  if (!sitemap.has(`${url}`))\n    return EMPTY\n\n  // We now know that we have a link to an internal page, so we prevent the\n  // browser from navigation and emit the URL for instant navigation. Note that\n  // this also includes anchor links, which means we need to implement anchor\n  // positioning ourselves. The reason for this is that if we wouldn't manage\n  // anchor links as well, scroll restoration will not work correctly (e.g.\n  // following an anchor link and scrolling).\n  ev.preventDefault()\n  return of(new URL(el.href))\n}\n\n/**\n * Create a map of head elements for lookup and replacement\n *\n * @param document - Document\n *\n * @returns Tag map\n */\nfunction head(document: Document): Map<string, HTMLElement> {\n  const tags = new Map<string, HTMLElement>()\n  for (const el of getElements(\":scope > *\", document.head))\n    tags.set(el.outerHTML, el)\n\n  // Return tag map\n  return tags\n}\n\n/**\n * Resolve relative URLs in the given document\n *\n * This function resolves relative `href` and `src` attributes, which can belong\n * to all sorts of tags, like meta tags, links, images, scripts and more.\n *\n * @param document - Document\n *\n * @returns Document observable\n */\nfunction resolve(document: Document): Observable<Document> {\n  for (const el of getElements(\"[href], [src]\", document))\n    for (const key of [\"href\", \"src\"]) {\n      const value = el.getAttribute(key)\n      if (value && !/^(?:[a-z]+:)?\\/\\//i.test(value)) {\n        // @ts-expect-error - trick: self-assign to resolve URL\n        el[key] = el[key]\n        break\n      }\n    }\n\n  // Return document observable\n  return of(document)\n}\n\n/**\n * Inject the contents of a document into the current one\n *\n * @param next - Next document\n *\n * @returns Document observable\n */\nfunction inject(next: Document): Observable<Document> {\n  for (const selector of [\n    
\"[data-md-component=announce]\",\n    \"[data-md-component=container]\",\n    \"[data-md-component=header-topic]\",\n    \"[data-md-component=outdated]\",\n    \"[data-md-component=logo]\",\n    \"[data-md-component=skip]\",\n    ...feature(\"navigation.tabs.sticky\")\n      ? [\"[data-md-component=tabs]\"]\n      : []\n  ]) {\n    const source = getOptionalElement(selector)\n    const target = getOptionalElement(selector, next)\n    if (\n      typeof source !== \"undefined\" &&\n      typeof target !== \"undefined\"\n    ) {\n      source.replaceWith(target)\n    }\n  }\n\n  // Update meta tags\n  const tags = head(document)\n  for (const [html, el] of head(next))\n    if (tags.has(html))\n      tags.delete(html)\n    else\n      document.head.appendChild(el)\n\n  // Remove meta tags that are not present in the new document\n  for (const el of tags.values()) {\n    const name = el.getAttribute(\"name\")\n    // @todo - find a better way to handle attributes we add dynamically in\n    // other components without mounting components on every navigation, as\n    // this might impact overall performance - see https://t.ly/ehp_O\n    if (name !== \"theme-color\" && name !== \"color-scheme\")\n      el.remove()\n  }\n\n  // After components and meta tags were replaced, re-evaluate scripts\n  // that were provided by the author as part of Markdown files\n  const container = getComponentElement(\"container\")\n  return concat(getElements(\"script\", container))\n    .pipe(\n      switchMap(el => {\n        const script = next.createElement(\"script\")\n        if (el.src) {\n          for (const name of el.getAttributeNames())\n            script.setAttribute(name, el.getAttribute(name)!)\n          el.replaceWith(script)\n\n          // Complete when script is loaded\n          return new Observable(observer => {\n            script.onload = () => observer.complete()\n          })\n\n        // Complete immediately\n        } else {\n          script.textContent = 
el.textContent\n          el.replaceWith(script)\n          return EMPTY\n        }\n      }),\n      ignoreElements(),\n      endWith(document)\n    )\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Set up instant navigation\n *\n * This is a heavily orchestrated operation - see inline comments to learn how\n * this works with Material for MkDocs, and how you can hook into it.\n *\n * @param options - Options\n *\n * @returns Document observable\n */\nexport function setupInstantNavigation(\n  { location$, viewport$, progress$ }: SetupOptions\n): Observable<Document> {\n  const config = configuration()\n  if (location.protocol === \"file:\")\n    return EMPTY\n\n  // Load sitemap immediately, so we have it available when the user initiates\n  // the first navigation request without any perceivable delay\n  const sitemap$ = fetchSitemap(config.base)\n\n  // Since we might be on a slow connection, the user might trigger multiple\n  // instant navigation events that overlap. MkDocs produces relative URLs for\n  // all internal links, which becomes a problem in this case, because we need\n  // to change the base URL the moment the user clicks a link that should be\n  // intercepted in order to be consistent with popstate, which means that the\n  // base URL would now be incorrect when resolving another relative link from\n  // the same site. For this reason we always resolve all relative links to\n  // absolute links, so we can be sure this never happens.\n  of(document)\n    .subscribe(resolve)\n\n  // --------------------------------------------------------------------------\n  // Navigation interception\n  // --------------------------------------------------------------------------\n\n  // Intercept navigation - to keep the number of event listeners down we use\n  // the fact that uncaptured events bubble up to the body. 
This has the nice\n  // property that we don't need to detach and then re-attach event listeners\n  // when the document is replaced after a navigation event.\n  const instant$ =\n    fromEvent<MouseEvent>(document.body, \"click\")\n      .pipe(\n        combineLatestWith(sitemap$),\n        switchMap(([ev, sitemap]) => handle(ev, sitemap)),\n        share()\n      )\n\n  // Intercept history change events, e.g. when the user uses the browser's\n  // back or forward buttons, and emit new location for fetching and parsing\n  const history$ =\n    fromEvent<PopStateEvent>(window, \"popstate\")\n      .pipe(\n        map(getLocation),\n        share()\n      )\n\n  // While it would be better UX to defer navigation events until the document\n  // is fully fetched and parsed, we must schedule it here to synchronize with\n  // popstate events, as they are emitted immediately. Moreover we need to\n  // store the current viewport offset for scroll restoration later on.\n  instant$.pipe(withLatestFrom(viewport$))\n    .subscribe(([url, { offset }]) => {\n      history.replaceState(offset, \"\")\n      history.pushState(null, \"\", url)\n    })\n\n  // Emit URLs that should be fetched via instant navigation on location subject\n  // which was passed into this function. The state of instant navigation can be\n  // intercepted by other parts of the application, which can synchronously back\n  // up or restore state before or after instant navigation happens.\n  merge(instant$, history$)\n    .subscribe(location$)\n\n  // --------------------------------------------------------------------------\n  // Fetching and parsing\n  // --------------------------------------------------------------------------\n\n  // Fetch document - we deduplicate requests to the same location, so we don't\n  // end up with multiple requests for the same page. 
We use `switchMap`, since\n  // we want to cancel the previous request when a new one is triggered, which\n  // is automatically handled by the observable returned by `request`. This is\n  // essential to ensure a good user experience, as we don't want to load pages\n  // that are not needed anymore, e.g., when the user clicks multiple links in\n  // quick succession or on slow connections. If the request fails for some\n  // reason, we fall back and use regular navigation, forcing a reload.\n  const document$ =\n    location$.pipe(\n      distinctUntilKeyChanged(\"pathname\"),\n      switchMap(url => requestHTML(url, { progress$ })\n        .pipe(\n          catchError(() => {\n            setLocation(url, true)\n            return EMPTY\n          })\n        )\n      ),\n\n      // The document was successfully fetched and parsed, so we can inject its\n      // contents into the currently active document\n      switchMap(resolve),\n      switchMap(inject),\n      share()\n    )\n\n  // --------------------------------------------------------------------------\n  // Scroll restoration\n  // --------------------------------------------------------------------------\n\n  // Handle scroll restoration - we must restore the viewport offset after the\n  // document has been fetched and injected, and every time the user clicks an\n  // anchor that leads to an element on the same page, which might also happen\n  // when the user uses the back or forward button.\n  merge(\n    document$.pipe(withLatestFrom(location$, (_, url) => url)),\n\n    // Handle instant navigation events that are triggered by the user clicking\n    // on an anchor link with a hash fragment different from the current one, as\n    // well as from popstate events, which are emitted when the user navigates\n    // back and forth between pages. 
We use a two-layered subscription to scope\n    // the scroll restoration to the current page, as we don't need to restore\n    // the viewport offset when the user navigates to a different page, as this\n    // is already handled by the previous observable.\n    document$.pipe(\n      switchMap(() => location$),\n      distinctUntilKeyChanged(\"pathname\"),\n      switchMap(() => location$),\n      distinctUntilKeyChanged(\"hash\")\n    ),\n\n    // Handle instant navigation events that are triggered by the user clicking\n    // on an anchor link with the same hash fragment as the current one in the\n    // URL. It is essential that we only intercept those from instant navigation\n    // events and not from history change events, or we'll end up in an endless\n    // loop. The top-level history entry must be removed, as it will be replaced\n    // with a new one, which would otherwise lead to a duplicate entry.\n    location$.pipe(\n      distinctUntilChanged((a, b) => (\n        a.pathname === b.pathname &&\n        a.hash     === b.hash\n      )),\n      switchMap(() => instant$),\n      tap(() => history.back())\n    )\n  )\n    .subscribe(url => {\n\n      // Check if the current history entry has a state, which happens when the\n      // user presses the back or forward button to visit a page we've already\n      // seen. If there's no state, it means a new page was visited and we must\n      // scroll to the top, unless an anchor is given.\n      if (history.state !== null || !url.hash) {\n        window.scrollTo(0, history.state?.y ?? 
0)\n      } else {\n        history.scrollRestoration = \"auto\"\n        setLocationHash(url.hash)\n        history.scrollRestoration = \"manual\"\n      }\n    })\n\n  // Disable scroll restoration when an instant navigation event occurs, so the\n  // browser does not immediately set the viewport offset to the prior history\n  // entry, scrolling to the position on the same page, which would look odd.\n  // Instead, we manually restore the position once the page has loaded.\n  location$.subscribe(() => {\n    history.scrollRestoration = \"manual\"\n  })\n\n  // Enable scroll restoration before window unloads - this is essential to\n  // ensure that full reloads (F5) restore the viewport offset correctly. If\n  // only popstate events wouldn't reset the viewport offset prior to their\n  // emission, we could just reset this in popstate. Meh.\n  fromEvent(window, \"beforeunload\")\n    .subscribe(() => {\n      history.scrollRestoration = \"auto\"\n    })\n\n  // Track viewport offset, so we can restore it when the user navigates back\n  // and forth between pages. 
Note that this must be debounced and cannot be\n  // done in popstate, as popstate has already removed the entry from the\n  // history, which means it is too late.\n  viewport$.pipe(\n    distinctUntilKeyChanged(\"offset\"),\n    debounceTime(100)\n  )\n    .subscribe(({ offset }) => {\n      history.replaceState(offset, \"\")\n    })\n\n  // Return document observable\n  return document$\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport escapeHTML from \"escape-html\"\n\nimport { SearchConfig } from \"../config\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search highlight function\n *\n * @param value - Value\n *\n * @returns Highlighted value\n */\nexport type SearchHighlightFn = (value: string) => string\n\n/**\n * Search highlight factory function\n *\n * @param query - Query value\n *\n * @returns Search highlight function\n */\nexport type SearchHighlightFactoryFn = (query: string) => SearchHighlightFn\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Create a search highlighter\n *\n * @param config - Search configuration\n *\n * @returns Search highlight factory function\n */\nexport function setupSearchHighlighter(\n  config: SearchConfig\n): SearchHighlightFactoryFn {\n  // Hack: temporarily remove pure lookaheads and lookbehinds\n  const regex = config.separator.split(\"|\").map(term => {\n    const temp = term.replace(/(\\(\\?[!=<][^)]+\\))/g, \"\")\n    return temp.length === 0 ? 
\"\uFFFD\" : term\n  })\n    .join(\"|\")\n\n  const separator = new RegExp(regex, \"img\")\n  const highlight = (_: unknown, data: string, term: string) => {\n    return `${data}<mark data-md-highlight>${term}</mark>`\n  }\n\n  /* Return factory function */\n  return (query: string) => {\n    query = query\n      .replace(/[\\s*+\\-:~^]+/g, \" \")\n      .trim()\n\n    /* Create search term match expression */\n    const match = new RegExp(`(^|${config.separator}|)(${\n      query\n        .replace(/[|\\\\{}()[\\]^$+*?.-]/g, \"\\\\$&\")\n        .replace(separator, \"|\")\n    })`, \"img\")\n\n    /* Highlight string value */\n    return value => escapeHTML(value)\n      .replace(match, highlight)\n      .replace(/<\\/mark>(\\s+)<mark[^>]*>/img, \"$1\")\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { SearchResult } from \"../../_\"\nimport { SearchIndex } from \"../../config\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search message type\n */\nexport const enum SearchMessageType {\n  SETUP,                               /* Search index setup */\n  READY,                               /* Search index ready */\n  QUERY,                               /* Search query */\n  RESULT                               /* Search results */\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Message containing the data necessary to setup the search index\n */\nexport interface SearchSetupMessage {\n  type: SearchMessageType.SETUP        /* Message type */\n  data: SearchIndex                    /* Message data */\n}\n\n/**\n * Message indicating the search index is ready\n */\nexport interface SearchReadyMessage {\n  type: SearchMessageType.READY        /* Message type */\n}\n\n/**\n * Message containing a search query\n */\nexport interface SearchQueryMessage {\n  type: SearchMessageType.QUERY        /* Message type */\n  data: string                         /* Message data */\n}\n\n/**\n * Message containing results for a search query\n */\nexport interface SearchResultMessage {\n  type: SearchMessageType.RESULT       /* Message type */\n  data: SearchResult                   /* Message data */\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Message exchanged with the search worker\n */\nexport type SearchMessage =\n  | SearchSetupMessage\n  | 
SearchReadyMessage\n  | SearchQueryMessage\n  | SearchResultMessage\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Type guard for search ready messages\n *\n * @param message - Search worker message\n *\n * @returns Test result\n */\nexport function isSearchReadyMessage(\n  message: SearchMessage\n): message is SearchReadyMessage {\n  return message.type === SearchMessageType.READY\n}\n\n/**\n * Type guard for search result messages\n *\n * @param message - Search worker message\n *\n * @returns Test result\n */\nexport function isSearchResultMessage(\n  message: SearchMessage\n): message is SearchResultMessage {\n  return message.type === SearchMessageType.RESULT\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  ObservableInput,\n  Subject,\n  first,\n  merge,\n  of,\n  switchMap\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport { watchToggle, watchWorker } from \"~/browser\"\n\nimport { SearchIndex } from \"../../config\"\nimport {\n  SearchMessage,\n  SearchMessageType\n} from \"../message\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Set up search worker\n *\n * This function creates and initializes a web worker that is used for search,\n * so that the user interface doesn't freeze. In general, the application does\n * not care how search is implemented, as long as the web worker conforms to\n * the format expected by the application as defined in `SearchMessage`. This\n * allows the author to implement custom search functionality, by providing a\n * custom web worker via configuration.\n *\n * Material for MkDocs' built-in search implementation makes use of Lunr.js, an\n * efficient and fast implementation for client-side search. 
Leveraging a tiny\n * iframe-based web worker shim, search is even supported for the `file://`\n * protocol, enabling search for local non-hosted builds.\n *\n * If the protocol is `file://`, search initialization is deferred to mitigate\n * freezing, as it's now synchronous by design - see https://bit.ly/3C521EO\n *\n * @see https://bit.ly/3igvtQv - How to implement custom search\n *\n * @param url - Worker URL\n * @param index$ - Search index observable input\n *\n * @returns Search worker\n */\nexport function setupSearchWorker(\n  url: string, index$: ObservableInput<SearchIndex>\n): Subject<SearchMessage> {\n  const worker$ = watchWorker<SearchMessage>(url)\n  merge(\n    of(location.protocol !== \"file:\"),\n    watchToggle(\"search\")\n  )\n    .pipe(\n      first(active => active),\n      switchMap(() => index$)\n    )\n      .subscribe(({ config, docs }) => worker$.next({\n        type: SearchMessageType.SETUP,\n        data: {\n          config,\n          docs,\n          options: {\n            suggest: feature(\"search.suggest\")\n          }\n        }\n      }))\n\n  /* Return search worker */\n  return worker$\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * 
FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Subject,\n  catchError,\n  combineLatest,\n  filter,\n  fromEvent,\n  map,\n  of,\n  switchMap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport { configuration } from \"~/_\"\nimport {\n  getElement,\n  getLocation,\n  requestJSON,\n  setLocation\n} from \"~/browser\"\nimport { getComponentElements } from \"~/components\"\nimport {\n  Version,\n  renderVersionSelector\n} from \"~/templates\"\n\nimport { fetchSitemap } from \"../sitemap\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Setup options\n */\ninterface SetupOptions {\n  document$: Subject<Document>         /* Document subject */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Set up version selector\n *\n * @param options - Options\n */\nexport function setupVersionSelector(\n  { document$ }: SetupOptions\n): void {\n  const config = configuration()\n  const versions$ = requestJSON<Version[]>(\n    new URL(\"../versions.json\", config.base)\n  )\n    .pipe(\n      catchError(() => EMPTY) // @todo refactor instant loading\n    )\n\n  /* Determine current version */\n  const current$ = versions$\n    .pipe(\n      map(versions => {\n        const [, current] = config.base.match(/([^/]+)\\/?$/)!\n        return versions.find(({ version, aliases }) => (\n          version === current || aliases.includes(current)\n        )) || versions[0]\n      })\n    )\n\n  /* Intercept 
inter-version navigation */\n  versions$\n    .pipe(\n      map(versions => new Map(versions.map(version => [\n        `${new URL(`../${version.version}/`, config.base)}`,\n        version\n      ]))),\n      switchMap(urls => fromEvent<MouseEvent>(document.body, \"click\")\n        .pipe(\n          filter(ev => !ev.metaKey && !ev.ctrlKey),\n          withLatestFrom(current$),\n          switchMap(([ev, current]) => {\n            if (ev.target instanceof Element) {\n              const el = ev.target.closest(\"a\")\n              if (el && !el.target && urls.has(el.href)) {\n                const url = el.href\n                // This is a temporary hack to detect if a version inside the\n                // version selector or on another part of the site was clicked.\n                // If we're inside the version selector, we definitely want to\n                // find the same page, as we might have different deployments\n                // due to aliases. However, if we're outside the version\n                // selector, we must abort here, because we might otherwise\n                // interfere with instant navigation. 
We need to refactor this\n                // at some point together with instant navigation.\n                //\n                // See https://github.com/squidfunk/mkdocs-material/issues/4012\n                if (!ev.target.closest(\".md-version\")) {\n                  const version = urls.get(url)!\n                  if (version === current)\n                    return EMPTY\n                }\n                ev.preventDefault()\n                return of(url)\n              }\n            }\n            return EMPTY\n          }),\n          switchMap(url => {\n            return fetchSitemap(new URL(url))\n              .pipe(\n                map(sitemap => {\n                  const location = getLocation()\n                  const path = location.href.replace(config.base, url)\n                  return sitemap.has(path.split(\"#\")[0])\n                    ? new URL(path)\n                    : new URL(url)\n                })\n              )\n          })\n        )\n      )\n    )\n      .subscribe(url => setLocation(url, true))\n\n  /* Render version selector and warning */\n  combineLatest([versions$, current$])\n    .subscribe(([versions, current]) => {\n      const topic = getElement(\".md-header__topic\")\n      topic.appendChild(renderVersionSelector(versions, current))\n    })\n\n  /* Integrate outdated version banner with instant navigation */\n  document$.pipe(switchMap(() => current$))\n    .subscribe(current => {\n\n      /* Check if version state was already determined */\n      let outdated = __md_get(\"__outdated\", sessionStorage)\n      if (outdated === null) {\n        outdated = true\n\n        /* Obtain and normalize default versions */\n        let ignored = config.version?.default || \"latest\"\n        if (!Array.isArray(ignored))\n          ignored = [ignored]\n\n        /* Check if version is considered a default */\n        main: for (const ignore of ignored)\n          for (const version of 
current.aliases.concat(current.version))\n            if (new RegExp(ignore, \"i\").test(version)) {\n              outdated = false\n              break main\n            }\n\n        /* Persist version state in session storage */\n        __md_set(\"__outdated\", outdated, sessionStorage)\n      }\n\n      /* Unhide outdated version banner */\n      if (outdated)\n        for (const warning of getComponentElements(\"outdated\"))\n          warning.hidden = false\n    })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  combineLatest,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  endWith,\n  finalize,\n  first,\n  fromEvent,\n  ignoreElements,\n  map,\n  merge,\n  shareReplay,\n  takeUntil,\n  tap\n} from \"rxjs\"\n\nimport {\n  getElement,\n  getLocation,\n  setToggle,\n  watchElementFocus,\n  watchToggle\n} from \"~/browser\"\nimport {\n  SearchMessage,\n  SearchMessageType,\n  isSearchReadyMessage\n} from \"~/integrations\"\n\nimport { Component } from \"../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search query\n */\nexport interface SearchQuery {\n  value: string                        /* Query value */\n  focus: boolean                       /* Query focus */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  worker$: Subject<SearchMessage>      /* Search worker */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  worker$: Subject<SearchMessage>      /* Search worker */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch search query\n *\n * Note that the focus event which triggers re-reading the current query value\n * is delayed by `1ms` so the input's empty state is allowed to propagate.\n *\n * @param el - Search query element\n * @param options - 
Options\n *\n * @returns Search query observable\n */\nexport function watchSearchQuery(\n  el: HTMLInputElement, { worker$ }: WatchOptions\n): Observable<SearchQuery> {\n\n  /* Support search deep linking */\n  const { searchParams } = getLocation()\n  if (searchParams.has(\"q\")) {\n    setToggle(\"search\", true)\n\n    /* Set query from parameter */\n    el.value = searchParams.get(\"q\")!\n    el.focus()\n\n    /* Remove query parameter on close */\n    watchToggle(\"search\")\n      .pipe(\n        first(active => !active)\n      )\n        .subscribe(() => {\n          const url = getLocation()\n          url.searchParams.delete(\"q\")\n          history.replaceState({}, \"\", `${url}`)\n        })\n  }\n\n  /* Intercept focus and input events */\n  const focus$ = watchElementFocus(el)\n  const value$ = merge(\n    worker$.pipe(first(isSearchReadyMessage)),\n    fromEvent(el, \"keyup\"),\n    focus$\n  )\n    .pipe(\n      map(() => el.value),\n      distinctUntilChanged()\n    )\n\n  /* Combine into single observable */\n  return combineLatest([value$, focus$])\n    .pipe(\n      map(([value, focus]) => ({ value, focus })),\n      shareReplay(1)\n    )\n}\n\n/**\n * Mount search query\n *\n * @param el - Search query element\n * @param options - Options\n *\n * @returns Search query component observable\n */\nexport function mountSearchQuery(\n  el: HTMLInputElement, { worker$ }: MountOptions\n): Observable<Component<SearchQuery, HTMLInputElement>> {\n  const push$ = new Subject<SearchQuery>()\n  const done$ = push$.pipe(ignoreElements(), endWith(true))\n\n  /* Handle value change */\n  combineLatest([\n    worker$.pipe(first(isSearchReadyMessage)),\n    push$\n  ], (_, query) => query)\n    .pipe(\n      distinctUntilKeyChanged(\"value\")\n    )\n      .subscribe(({ value }) => worker$.next({\n        type: SearchMessageType.QUERY,\n        data: value\n      }))\n\n  /* Handle focus change */\n  push$\n    .pipe(\n      
distinctUntilKeyChanged(\"focus\")\n    )\n      .subscribe(({ focus }) => {\n        if (focus)\n          setToggle(\"search\", focus)\n      })\n\n  /* Handle reset */\n  fromEvent(el.form!, \"reset\")\n    .pipe(\n      takeUntil(done$)\n    )\n      .subscribe(() => el.focus())\n\n  // Focus search query on label click - note that this is necessary to bring\n  // up the keyboard on iOS and other mobile platforms, as the search dialog is\n  // not visible at first, and programmatically focusing an input element must\n  // be triggered by a user interaction - see https://t.ly/Cb30n\n  const label = getElement(\"header [for=__search]\")\n  fromEvent(label, \"click\")\n    .subscribe(() => el.focus())\n\n  /* Create and return component */\n  return watchSearchQuery(el, { worker$ })\n    .pipe(\n      tap(state => push$.next(state)),\n      finalize(() => push$.complete()),\n      map(state => ({ ref: el, ...state })),\n      shareReplay(1)\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  bufferCount,\n  filter,\n  finalize,\n  first,\n  fromEvent,\n  map,\n  merge,\n  mergeMap,\n  of,\n  share,\n  skipUntil,\n  switchMap,\n  takeUntil,\n  tap,\n  withLatestFrom,\n  zipWith\n} from \"rxjs\"\n\nimport { translation } from \"~/_\"\nimport {\n  getElement,\n  getOptionalElement,\n  watchElementBoundary,\n  watchToggle\n} from \"~/browser\"\nimport {\n  SearchMessage,\n  SearchResult,\n  isSearchReadyMessage,\n  isSearchResultMessage\n} from \"~/integrations\"\nimport { renderSearchResultItem } from \"~/templates\"\nimport { round } from \"~/utilities\"\n\nimport { Component } from \"../../_\"\nimport { SearchQuery } from \"../query\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  query$: Observable<SearchQuery>      /* Search query observable */\n  worker$: Subject<SearchMessage>      /* Search worker */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount search result list\n *\n * This function performs a lazy rendering of the search results, depending on\n * the vertical offset of the search result container.\n *\n * @param el - Search result list element\n * @param options - Options\n *\n * @returns Search result list component observable\n */\nexport function mountSearchResult(\n  el: HTMLElement, { worker$, query$ }: MountOptions\n): Observable<Component<SearchResult>> {\n  const push$ = 
new Subject<SearchResult>()\n  const boundary$ = watchElementBoundary(el.parentElement!)\n    .pipe(\n      filter(Boolean)\n    )\n\n  /* Retrieve container */\n  const container = el.parentElement!\n\n  /* Retrieve nested components */\n  const meta = getElement(\":scope > :first-child\", el)\n  const list = getElement(\":scope > :last-child\", el)\n\n  /* Reveal to accessibility tree \u2013 see https://bit.ly/3iAA7t8 */\n  watchToggle(\"search\")\n    .subscribe(active => list.setAttribute(\n      \"role\", active ? \"list\" : \"presentation\"\n    ))\n\n  /* Update search result metadata */\n  push$\n    .pipe(\n      withLatestFrom(query$),\n      skipUntil(worker$.pipe(first(isSearchReadyMessage)))\n    )\n      .subscribe(([{ items }, { value }]) => {\n        switch (items.length) {\n\n          /* No results */\n          case 0:\n            meta.textContent = value.length\n              ? translation(\"search.result.none\")\n              : translation(\"search.result.placeholder\")\n            break\n\n          /* One result */\n          case 1:\n            meta.textContent = translation(\"search.result.one\")\n            break\n\n          /* Multiple result */\n          default:\n            const count = round(items.length)\n            meta.textContent = translation(\"search.result.other\", count)\n        }\n      })\n\n  /* Render search result item */\n  const render$ = push$\n    .pipe(\n      tap(() => list.innerHTML = \"\"),\n      switchMap(({ items }) => merge(\n        of(...items.slice(0, 10)),\n        of(...items.slice(10))\n          .pipe(\n            bufferCount(4),\n            zipWith(boundary$),\n            switchMap(([chunk]) => chunk)\n          )\n      )),\n      map(renderSearchResultItem),\n      share()\n    )\n\n  /* Update search result list */\n  render$.subscribe(item => list.appendChild(item))\n  render$\n    .pipe(\n      mergeMap(item => {\n        const details = getOptionalElement(\"details\", item)\n        
if (typeof details === \"undefined\")\n          return EMPTY\n\n        /* Keep position of details element stable */\n        return fromEvent(details, \"toggle\")\n          .pipe(\n            takeUntil(push$),\n            map(() => details)\n          )\n      })\n    )\n      .subscribe(details => {\n        if (\n          details.open === false &&\n          details.offsetTop <= container.scrollTop\n        )\n          container.scrollTo({ top: details.offsetTop })\n      })\n\n  /* Filter search result message */\n  const result$ = worker$\n    .pipe(\n      filter(isSearchResultMessage),\n      map(({ data }) => data)\n    )\n\n  /* Create and return component */\n  return result$\n    .pipe(\n      tap(state => push$.next(state)),\n      finalize(() => push$.complete()),\n      map(state => ({ ref: el, ...state }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  endWith,\n  finalize,\n  fromEvent,\n  ignoreElements,\n  map,\n  takeUntil,\n  tap\n} from \"rxjs\"\n\nimport { getLocation } from \"~/browser\"\n\nimport { Component } from \"../../_\"\nimport { SearchQuery } from \"../query\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search sharing\n */\nexport interface SearchShare {\n  url: URL                             /* Deep link for sharing */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  query$: Observable<SearchQuery>      /* Search query observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  query$: Observable<SearchQuery>      /* Search query observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount search sharing\n *\n * @param _el - Search sharing element\n * @param options - Options\n *\n * @returns Search sharing observable\n */\nexport function watchSearchShare(\n  _el: HTMLElement, { query$ }: WatchOptions\n): Observable<SearchShare> {\n  return query$\n    .pipe(\n      map(({ value }) => {\n        const url = getLocation()\n        url.hash = \"\"\n\n        /* Compute readable query strings */\n        value = value\n          .replace(/\\s+/g, \"+\")        /* Collapse whitespace */\n  
        .replace(/&/g, \"%26\")        /* Escape '&' character */\n          .replace(/=/g, \"%3D\")        /* Escape '=' character */\n\n        /* Replace query string */\n        url.search = `q=${value}`\n        return { url }\n      })\n    )\n}\n\n/**\n * Mount search sharing\n *\n * @param el - Search sharing element\n * @param options - Options\n *\n * @returns Search sharing component observable\n */\nexport function mountSearchShare(\n  el: HTMLAnchorElement, options: MountOptions\n): Observable<Component<SearchShare>> {\n  const push$ = new Subject<SearchShare>()\n  const done$ = push$.pipe(ignoreElements(), endWith(true))\n  push$.subscribe(({ url }) => {\n    el.setAttribute(\"data-clipboard-text\", el.href)\n    el.href = `${url}`\n  })\n\n  /* Prevent following of link */\n  fromEvent(el, \"click\")\n    .pipe(\n      takeUntil(done$)\n    )\n      .subscribe(ev => ev.preventDefault())\n\n  /* Create and return component */\n  return watchSearchShare(el, options)\n    .pipe(\n      tap(state => push$.next(state)),\n      finalize(() => push$.complete()),\n      map(state => ({ ref: el, ...state }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS 
FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  asyncScheduler,\n  combineLatestWith,\n  distinctUntilChanged,\n  filter,\n  finalize,\n  fromEvent,\n  map,\n  merge,\n  observeOn,\n  tap\n} from \"rxjs\"\n\nimport { Keyboard } from \"~/browser\"\nimport {\n  SearchMessage,\n  SearchResult,\n  isSearchResultMessage\n} from \"~/integrations\"\n\nimport { Component, getComponentElement } from \"../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search suggestions\n */\nexport interface SearchSuggest {}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  keyboard$: Observable<Keyboard>      /* Keyboard observable */\n  worker$: Subject<SearchMessage>      /* Search worker */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount search suggestions\n *\n * This function will perform a lazy rendering of the search results, depending\n * on the vertical offset of the search result container.\n *\n * @param el - Search result list element\n * @param options - Options\n *\n * @returns Search result list component observable\n */\nexport function mountSearchSuggest(\n  el: HTMLElement, { worker$, keyboard$ }: MountOptions\n): Observable<Component<SearchSuggest>> {\n  const push$ = new 
Subject<SearchResult>()\n\n  /* Retrieve query component and track all changes */\n  const query  = getComponentElement(\"search-query\")\n  const query$ = merge(\n    fromEvent(query, \"keydown\"),\n    fromEvent(query, \"focus\")\n  )\n    .pipe(\n      observeOn(asyncScheduler),\n      map(() => query.value),\n      distinctUntilChanged(),\n    )\n\n  /* Update search suggestions */\n  push$\n    .pipe(\n      combineLatestWith(query$),\n      map(([{ suggest }, value]) => {\n        const words = value.split(/([\\s-]+)/)\n        if (suggest?.length && words[words.length - 1]) {\n          const last = suggest[suggest.length - 1]\n          if (last.startsWith(words[words.length - 1]))\n            words[words.length - 1] = last\n        } else {\n          words.length = 0\n        }\n        return words\n      })\n    )\n      .subscribe(words => el.innerHTML = words\n        .join(\"\")\n        .replace(/\\s/g, \"&nbsp;\")\n      )\n\n  /* Set up search keyboard handlers */\n  keyboard$\n    .pipe(\n      filter(({ mode }) => mode === \"search\")\n    )\n      .subscribe(key => {\n        switch (key.type) {\n\n          /* Right arrow: accept current suggestion */\n          case \"ArrowRight\":\n            if (\n              el.innerText.length &&\n              query.selectionStart === query.value.length\n            )\n              query.value = el.innerText\n            break\n        }\n      })\n\n  /* Filter search result message */\n  const result$ = worker$\n    .pipe(\n      filter(isSearchResultMessage),\n      map(({ data }) => data)\n    )\n\n  /* Create and return component */\n  return result$\n    .pipe(\n      tap(state => push$.next(state)),\n      finalize(() => push$.complete()),\n      map(() => ({ ref: el }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation 
files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  NEVER,\n  Observable,\n  ObservableInput,\n  filter,\n  fromEvent,\n  merge,\n  mergeWith\n} from \"rxjs\"\n\nimport { configuration } from \"~/_\"\nimport {\n  Keyboard,\n  getActiveElement,\n  getElements,\n  setToggle\n} from \"~/browser\"\nimport {\n  SearchIndex,\n  SearchResult,\n  setupSearchWorker\n} from \"~/integrations\"\n\nimport {\n  Component,\n  getComponentElement,\n  getComponentElements\n} from \"../../_\"\nimport {\n  SearchQuery,\n  mountSearchQuery\n} from \"../query\"\nimport { mountSearchResult } from \"../result\"\nimport {\n  SearchShare,\n  mountSearchShare\n} from \"../share\"\nimport {\n  SearchSuggest,\n  mountSearchSuggest\n} from \"../suggest\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search\n */\nexport type Search =\n  | SearchQuery\n  | SearchResult\n  | SearchShare\n  | SearchSuggest\n\n/* 
----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  index$: ObservableInput<SearchIndex> /* Search index observable */\n  keyboard$: Observable<Keyboard>      /* Keyboard observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount search\n *\n * This function sets up the search functionality, including the underlying\n * web worker and all keyboard bindings.\n *\n * @param el - Search element\n * @param options - Options\n *\n * @returns Search component observable\n */\nexport function mountSearch(\n  el: HTMLElement, { index$, keyboard$ }: MountOptions\n): Observable<Component<Search>> {\n  const config = configuration()\n  try {\n    const worker$ = setupSearchWorker(config.search, index$)\n\n    /* Retrieve query and result components */\n    const query  = getComponentElement(\"search-query\", el)\n    const result = getComponentElement(\"search-result\", el)\n\n    /* Always close search on result selection */\n    fromEvent<PointerEvent>(el, \"click\")\n      .pipe(\n        filter(({ target }) => (\n          target instanceof Element && !!target.closest(\"a\")\n        ))\n      )\n        .subscribe(() => setToggle(\"search\", false))\n\n    /* Set up search keyboard handlers */\n    keyboard$\n      .pipe(\n        filter(({ mode }) => mode === \"search\")\n      )\n        .subscribe(key => {\n          const active = getActiveElement()\n          switch (key.type) {\n\n            /* Enter: go to first (best) result */\n            case \"Enter\":\n              if (active === query) {\n                const anchors = new Map<HTMLAnchorElement, number>()\n                for (const anchor of getElements<HTMLAnchorElement>(\n 
                 \":first-child [href]\", result\n                )) {\n                  const article = anchor.firstElementChild!\n                  anchors.set(anchor, parseFloat(\n                    article.getAttribute(\"data-md-score\")!\n                  ))\n                }\n\n                /* Go to result with highest score, if any */\n                if (anchors.size) {\n                  const [[best]] = [...anchors].sort(([, a], [, b]) => b - a)\n                  best.click()\n                }\n\n                /* Otherwise omit form submission */\n                key.claim()\n              }\n              break\n\n            /* Escape or Tab: close search */\n            case \"Escape\":\n            case \"Tab\":\n              setToggle(\"search\", false)\n              query.blur()\n              break\n\n            /* Vertical arrows: select previous or next search result */\n            case \"ArrowUp\":\n            case \"ArrowDown\":\n              if (typeof active === \"undefined\") {\n                query.focus()\n              } else {\n                const els = [query, ...getElements(\n                  \":not(details) > [href], summary, details[open] [href]\",\n                  result\n                )]\n                const i = Math.max(0, (\n                  Math.max(0, els.indexOf(active)) + els.length + (\n                    key.type === \"ArrowUp\" ? 
-1 : +1\n                  )\n                ) % els.length)\n                els[i].focus()\n              }\n\n              /* Prevent scrolling of page */\n              key.claim()\n              break\n\n            /* All other keys: hand to search query */\n            default:\n              if (query !== getActiveElement())\n                query.focus()\n          }\n        })\n\n    /* Set up global keyboard handlers */\n    keyboard$\n      .pipe(\n        filter(({ mode }) => mode === \"global\")\n      )\n        .subscribe(key => {\n          switch (key.type) {\n\n            /* Open search and select query */\n            case \"f\":\n            case \"s\":\n            case \"/\":\n              query.focus()\n              query.select()\n\n              /* Prevent scrolling of page */\n              key.claim()\n              break\n          }\n        })\n\n    /* Create and return component */\n    const query$ = mountSearchQuery(query, { worker$ })\n    return merge(\n      query$,\n      mountSearchResult(result, { worker$, query$ })\n    )\n      .pipe(\n        mergeWith(\n\n          /* Search sharing */\n          ...getComponentElements(\"search-share\", el)\n            .map(child => mountSearchShare(child, { query$ })),\n\n          /* Search suggestions */\n          ...getComponentElements(\"search-suggest\", el)\n            .map(child => mountSearchSuggest(child, { worker$, keyboard$ }))\n        )\n      )\n\n  /* Gracefully handle broken search */\n  } catch (err) {\n    el.hidden = true\n    return NEVER\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies 
of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  ObservableInput,\n  combineLatest,\n  filter,\n  map,\n  startWith\n} from \"rxjs\"\n\nimport { getLocation } from \"~/browser\"\nimport {\n  SearchIndex,\n  setupSearchHighlighter\n} from \"~/integrations\"\nimport { h } from \"~/utilities\"\n\nimport { Component } from \"../../_\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search highlighting\n */\nexport interface SearchHighlight {\n  nodes: Map<ChildNode, string>        /* Map of replacements */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  index$: ObservableInput<SearchIndex> /* Search index observable */\n  location$: Observable<URL>           /* Location observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Mount search highlighting\n *\n * @param 
el - Content element\n * @param options - Options\n *\n * @returns Search highlighting component observable\n */\nexport function mountSearchHiglight(\n  el: HTMLElement, { index$, location$ }: MountOptions\n): Observable<Component<SearchHighlight>> {\n  return combineLatest([\n    index$,\n    location$\n      .pipe(\n        startWith(getLocation()),\n        filter(url => !!url.searchParams.get(\"h\"))\n      )\n  ])\n    .pipe(\n      map(([index, url]) => setupSearchHighlighter(index.config)(\n        url.searchParams.get(\"h\")!\n      )),\n      map(fn => {\n        const nodes = new Map<ChildNode, string>()\n\n        /* Traverse text nodes and collect matches */\n        const it = document.createNodeIterator(el, NodeFilter.SHOW_TEXT)\n        for (let node = it.nextNode(); node; node = it.nextNode()) {\n          if (node.parentElement?.offsetHeight) {\n            const original = node.textContent!\n            const replaced = fn(original)\n            if (replaced.length > original.length)\n              nodes.set(node as ChildNode, replaced)\n          }\n        }\n\n        /* Replace original nodes with matches */\n        for (const [node, text] of nodes) {\n          const { childNodes } = h(\"span\", null, text)\n          node.replaceWith(...Array.from(childNodes))\n        }\n\n        /* Return component */\n        return { ref: el, nodes }\n      })\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this 
permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  animationFrameScheduler,\n  asyncScheduler,\n  auditTime,\n  combineLatest,\n  defer,\n  distinctUntilChanged,\n  endWith,\n  finalize,\n  first,\n  from,\n  fromEvent,\n  ignoreElements,\n  map,\n  mergeMap,\n  observeOn,\n  takeUntil,\n  tap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport {\n  Viewport,\n  getElement,\n  getElementOffset,\n  getElementSize,\n  getElements\n} from \"~/browser\"\n\nimport { Component } from \"../_\"\nimport { Header } from \"../header\"\nimport { Main } from \"../main\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Sidebar\n */\nexport interface Sidebar {\n  height: number                       /* Sidebar height */\n  locked: boolean                      /* Sidebar is locked */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  main$: Observable<Main>              /* Main area observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  
header$: Observable<Header>          /* Header observable */\n  main$: Observable<Main>              /* Main area observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch sidebar\n *\n * This function returns an observable that computes the visual parameters of\n * the sidebar which depends on the vertical viewport offset, as well as the\n * height of the main area. When the page is scrolled beyond the header, the\n * sidebar is locked and fills the remaining space.\n *\n * @param el - Sidebar element\n * @param options - Options\n *\n * @returns Sidebar observable\n */\nexport function watchSidebar(\n  el: HTMLElement, { viewport$, main$ }: WatchOptions\n): Observable<Sidebar> {\n  const parent = el.closest<HTMLElement>(\".md-grid\")!\n  const adjust =\n    parent.offsetTop -\n    parent.parentElement!.offsetTop\n\n  /* Compute the sidebar's available height and if it should be locked */\n  return combineLatest([main$, viewport$])\n    .pipe(\n      map(([{ offset, height }, { offset: { y } }]) => {\n        height = height\n          + Math.min(adjust, Math.max(0, y - offset))\n          - adjust\n        return {\n          height,\n          locked: y >= offset + adjust\n        }\n      }),\n      distinctUntilChanged((a, b) => (\n        a.height === b.height &&\n        a.locked === b.locked\n      ))\n    )\n}\n\n/**\n * Mount sidebar\n *\n * This function doesn't set the height of the actual sidebar, but of its first\n * child \u2013 the `.md-sidebar__scrollwrap` element in order to mitigiate jittery\n * sidebars when the footer is scrolled into view. At some point we switched\n * from `absolute` / `fixed` positioning to `sticky` positioning, significantly\n * reducing jitter in some browsers (respectively Firefox and Safari) when\n * scrolling from the top. 
However, top-aligned sticky positioning means that\n * the sidebar snaps to the bottom when the end of the container is reached.\n * This is what leads to the mentioned jitter, as the sidebar's height may be\n * updated too slowly.\n *\n * This behaviour can be mitigiated by setting the height of the sidebar to `0`\n * while preserving the padding, and the height on its first element.\n *\n * @param el - Sidebar element\n * @param options - Options\n *\n * @returns Sidebar component observable\n */\nexport function mountSidebar(\n  el: HTMLElement, { header$, ...options }: MountOptions\n): Observable<Component<Sidebar>> {\n  const inner = getElement(\".md-sidebar__scrollwrap\", el)\n  const { y } = getElementOffset(inner)\n  return defer(() => {\n    const push$ = new Subject<Sidebar>()\n    const done$ = push$.pipe(ignoreElements(), endWith(true))\n    const next$ = push$\n      .pipe(\n        auditTime(0, animationFrameScheduler)\n      )\n\n    /* Update sidebar height and offset */\n    next$.pipe(withLatestFrom(header$))\n      .subscribe({\n\n        /* Handle emission */\n        next([{ height }, { height: offset }]) {\n          inner.style.height = `${height - 2 * y}px`\n          el.style.top       = `${offset}px`\n        },\n\n        /* Handle complete */\n        complete() {\n          inner.style.height = \"\"\n          el.style.top       = \"\"\n        }\n      })\n\n    /* Bring active item into view on initial load */\n    next$.pipe(first())\n      .subscribe(() => {\n        for (const item of getElements(\".md-nav__link--active[href]\", el)) {\n          if (!item.clientHeight) // skip invisible toc in left sidebar\n            continue\n          const container = item.closest<HTMLElement>(\".md-sidebar__scrollwrap\")!\n          if (typeof container !== \"undefined\") {\n            const offset = item.offsetTop - container.offsetTop\n            const { height } = getElementSize(container)\n            container.scrollTo({\n             
 top: offset - height / 2\n            })\n          }\n        }\n      })\n\n    /* Handle accessibility for expandable items, see https://bit.ly/3jaod9p */\n    from(getElements<HTMLLabelElement>(\"label[tabindex]\", el))\n      .pipe(\n        mergeMap(label => fromEvent(label, \"click\")\n          .pipe(\n            observeOn(asyncScheduler),\n            map(() => label),\n            takeUntil(done$)\n          )\n        )\n      )\n        .subscribe(label => {\n          const input = getElement<HTMLInputElement>(`[id=\"${label.htmlFor}\"]`)\n          const nav = getElement(`[aria-labelledby=\"${label.id}\"]`)\n          nav.setAttribute(\"aria-expanded\", `${input.checked}`)\n        })\n\n    /* Create and return component */\n    return watchSidebar(el, options)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { Repo, User } from \"github-types\"\nimport {\n  EMPTY,\n  Observable,\n  catchError,\n  defaultIfEmpty,\n  map,\n  zip\n} from \"rxjs\"\n\nimport { requestJSON } from \"~/browser\"\n\nimport { SourceFacts } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * GitHub release (partial)\n */\ninterface Release {\n  tag_name: string                     /* Tag name */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch GitHub repository facts\n *\n * @param user - GitHub user or organization\n * @param repo - GitHub repository\n *\n * @returns Repository facts observable\n */\nexport function fetchSourceFactsFromGitHub(\n  user: string, repo?: string\n): Observable<SourceFacts> {\n  if (typeof repo !== \"undefined\") {\n    const url = `https://api.github.com/repos/${user}/${repo}`\n    return zip(\n\n      /* Fetch version */\n      requestJSON<Release>(`${url}/releases/latest`)\n        .pipe(\n          catchError(() => EMPTY), // @todo refactor instant loading\n          map(release => ({\n            version: release.tag_name\n          })),\n          defaultIfEmpty({})\n        ),\n\n      /* Fetch stars and forks */\n      requestJSON<Repo>(url)\n        .pipe(\n          catchError(() => EMPTY), // @todo refactor instant loading\n          map(info => ({\n            stars: info.stargazers_count,\n            forks: info.forks_count\n          })),\n          defaultIfEmpty({})\n   
     )\n    )\n      .pipe(\n        map(([release, info]) => ({ ...release, ...info }))\n      )\n\n  /* User or organization */\n  } else {\n    const url = `https://api.github.com/users/${user}`\n    return requestJSON<User>(url)\n      .pipe(\n        map(info => ({\n          repositories: info.public_repos\n        })),\n        defaultIfEmpty({})\n      )\n  }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { ProjectSchema } from \"gitlab\"\nimport {\n  EMPTY,\n  Observable,\n  catchError,\n  defaultIfEmpty,\n  map\n} from \"rxjs\"\n\nimport { requestJSON } from \"~/browser\"\n\nimport { SourceFacts } from \"../_\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch GitLab repository facts\n *\n * @param base - GitLab base\n * @param project - GitLab project\n *\n * @returns Repository facts observable\n */\nexport function fetchSourceFactsFromGitLab(\n  base: string, project: string\n): Observable<SourceFacts> {\n  const url = `https://${base}/api/v4/projects/${encodeURIComponent(project)}`\n  return requestJSON<ProjectSchema>(url)\n    .pipe(\n      catchError(() => EMPTY), // @todo refactor instant loading\n      map(({ star_count, forks_count }) => ({\n        stars: star_count,\n        forks: forks_count\n      })),\n      defaultIfEmpty({})\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE 
SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { EMPTY, Observable } from \"rxjs\"\n\nimport { fetchSourceFactsFromGitHub } from \"../github\"\nimport { fetchSourceFactsFromGitLab } from \"../gitlab\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Repository facts for repositories\n */\nexport interface RepositoryFacts {\n  stars?: number                       /* Number of stars */\n  forks?: number                       /* Number of forks */\n  version?: string                     /* Latest version */\n}\n\n/**\n * Repository facts for organizations\n */\nexport interface OrganizationFacts {\n  repositories?: number                /* Number of repositories */\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Repository facts\n */\nexport type SourceFacts =\n  | RepositoryFacts\n  | OrganizationFacts\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch repository facts\n *\n * @param url - Repository URL\n *\n * @returns Repository facts observable\n */\nexport function fetchSourceFacts(\n  url: string\n): Observable<SourceFacts> {\n\n  /* Try to match GitHub repository */\n  let match = url.match(/^.+github\\.com\\/([^/]+)\\/?([^/]+)?/i)\n  if (match) {\n    const [, user, repo] = match\n    return 
fetchSourceFactsFromGitHub(user, repo)\n  }\n\n  /* Try to match GitLab repository */\n  match = url.match(/^.+?([^/]*gitlab[^/]+)\\/(.+?)\\/?$/i)\n  if (match) {\n    const [, base, slug] = match\n    return fetchSourceFactsFromGitLab(base, slug)\n  }\n\n  /* Fallback */\n  return EMPTY\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  Subject,\n  catchError,\n  defer,\n  filter,\n  finalize,\n  map,\n  of,\n  shareReplay,\n  tap\n} from \"rxjs\"\n\nimport { getElement } from \"~/browser\"\nimport { ConsentDefaults } from \"~/components/consent\"\nimport { renderSourceFacts } from \"~/templates\"\n\nimport {\n  Component,\n  getComponentElements\n} from \"../../_\"\nimport {\n  SourceFacts,\n  fetchSourceFacts\n} from \"../facts\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Repository information\n */\nexport interface Source {\n  facts: SourceFacts                   /* Repository facts */\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Repository information observable\n */\nlet fetch$: Observable<Source>\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch repository information\n *\n * This function tries to read the repository facts from session storage, and\n * if unsuccessful, fetches them from the underlying provider.\n *\n * @param el - Repository information element\n *\n * @returns Repository information observable\n */\nexport function watchSource(\n  el: HTMLAnchorElement\n): Observable<Source> {\n  return fetch$ ||= defer(() => {\n    const cached = __md_get<SourceFacts>(\"__source\", sessionStorage)\n    if (cached) {\n      return of(cached)\n    } else 
{\n\n      /* Check if consent is configured and was given */\n      const els = getComponentElements(\"consent\")\n      if (els.length) {\n        const consent = __md_get<ConsentDefaults>(\"__consent\")\n        if (!(consent && consent.github))\n          return EMPTY\n      }\n\n      /* Fetch repository facts */\n      return fetchSourceFacts(el.href)\n        .pipe(\n          tap(facts => __md_set(\"__source\", facts, sessionStorage))\n        )\n    }\n  })\n    .pipe(\n      catchError(() => EMPTY),\n      filter(facts => Object.keys(facts).length > 0),\n      map(facts => ({ facts })),\n      shareReplay(1)\n    )\n}\n\n/**\n * Mount repository information\n *\n * @param el - Repository information element\n *\n * @returns Repository information component observable\n */\nexport function mountSource(\n  el: HTMLAnchorElement\n): Observable<Component<Source>> {\n  const inner = getElement(\":scope > :last-child\", el)\n  return defer(() => {\n    const push$ = new Subject<Source>()\n    push$.subscribe(({ facts }) => {\n      inner.appendChild(renderSourceFacts(facts))\n      inner.classList.add(\"md-source__repository--active\")\n    })\n\n    /* Create and return component */\n    return watchSource(el)\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission 
notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  defer,\n  distinctUntilKeyChanged,\n  finalize,\n  map,\n  of,\n  switchMap,\n  tap\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport {\n  Viewport,\n  watchElementSize,\n  watchViewportAt\n} from \"~/browser\"\n\nimport { Component } from \"../_\"\nimport { Header } from \"../header\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Navigation tabs\n */\nexport interface Tabs {\n  hidden: boolean                      /* Navigation tabs are hidden */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch navigation 
tabs\n *\n * @param el - Navigation tabs element\n * @param options - Options\n *\n * @returns Navigation tabs observable\n */\nexport function watchTabs(\n  el: HTMLElement, { viewport$, header$ }: WatchOptions\n): Observable<Tabs> {\n  return watchElementSize(document.body)\n    .pipe(\n      switchMap(() => watchViewportAt(el, { header$, viewport$ })),\n      map(({ offset: { y } }) => {\n        return {\n          hidden: y >= 10\n        }\n      }),\n      distinctUntilKeyChanged(\"hidden\")\n    )\n}\n\n/**\n * Mount navigation tabs\n *\n * This function hides the navigation tabs when scrolling past the threshold\n * and makes them reappear in a nice CSS animation when scrolling back up.\n *\n * @param el - Navigation tabs element\n * @param options - Options\n *\n * @returns Navigation tabs component observable\n */\nexport function mountTabs(\n  el: HTMLElement, options: MountOptions\n): Observable<Component<Tabs>> {\n  return defer(() => {\n    const push$ = new Subject<Tabs>()\n    push$.subscribe({\n\n      /* Handle emission */\n      next({ hidden }) {\n        el.hidden = hidden\n      },\n\n      /* Handle complete */\n      complete() {\n        el.hidden = false\n      }\n    })\n\n    /* Create and return component */\n    return (\n      feature(\"navigation.tabs.sticky\")\n        ? 
of({ hidden: false })\n        : watchTabs(el, options)\n    )\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  asyncScheduler,\n  bufferCount,\n  combineLatestWith,\n  debounceTime,\n  defer,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  endWith,\n  filter,\n  finalize,\n  ignoreElements,\n  map,\n  merge,\n  observeOn,\n  of,\n  repeat,\n  scan,\n  share,\n  skip,\n  startWith,\n  switchMap,\n  takeUntil,\n  tap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport {\n  Viewport,\n  getElement,\n  getElementContainer,\n  getElementSize,\n  getElements,\n  getLocation,\n  getOptionalElement,\n  watchElementSize\n} from \"~/browser\"\n\nimport {\n  Component,\n  getComponentElement\n} from \"../_\"\nimport { Header } from \"../header\"\nimport { Main } from \"../main\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Table of contents\n */\nexport interface TableOfContents {\n  prev: HTMLAnchorElement[][]          /* Anchors (previous) */\n  next: HTMLAnchorElement[][]          /* Anchors (next) */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n  main$: Observable<Main>              /* 
Main area observable */\n  target$: Observable<HTMLElement>     /* Location target observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch table of contents\n *\n * This is effectively a scroll spy implementation which will account for the\n * fixed header and automatically re-calculate anchor offsets when the viewport\n * is resized. The returned observable will only emit if the table of contents\n * needs to be repainted.\n *\n * This implementation tracks an anchor element's entire path starting from its\n * level up to the top-most anchor element, e.g. `[h3, h2, h1]`. Although the\n * Material theme currently doesn't make use of this information, it enables\n * the styling of the entire hierarchy through customization.\n *\n * Note that the current anchor is the last item of the `prev` anchor list.\n *\n * @param el - Table of contents element\n * @param options - Options\n *\n * @returns Table of contents observable\n */\nexport function watchTableOfContents(\n  el: HTMLElement, { viewport$, header$ }: WatchOptions\n): Observable<TableOfContents> {\n  const table = new Map<HTMLAnchorElement, HTMLElement>()\n\n  /* Compute anchor-to-target mapping */\n  const anchors = getElements<HTMLAnchorElement>(\".md-nav__link\", el)\n  for (const anchor of anchors) {\n    const id = decodeURIComponent(anchor.hash.substring(1))\n    const target = getOptionalElement(`[id=\"${id}\"]`)\n    if (typeof target !== \"undefined\")\n      table.set(anchor, target)\n  }\n\n  /* Compute necessary adjustment for header */\n  const adjust$ = header$\n    .pipe(\n      distinctUntilKeyChanged(\"height\"),\n      map(({ height }) => {\n        const main = getComponentElement(\"main\")\n        const grid = getElement(\":scope > :first-child\", main)\n        return height + 0.8 * (\n          grid.offsetTop -\n          
main.offsetTop\n        )\n      }),\n      share()\n    )\n\n  /* Compute partition of previous and next anchors */\n  const partition$ = watchElementSize(document.body)\n    .pipe(\n      distinctUntilKeyChanged(\"height\"),\n\n      /* Build index to map anchor paths to vertical offsets */\n      switchMap(body => defer(() => {\n        let path: HTMLAnchorElement[] = []\n        return of([...table].reduce((index, [anchor, target]) => {\n          while (path.length) {\n            const last = table.get(path[path.length - 1])!\n            if (last.tagName >= target.tagName) {\n              path.pop()\n            } else {\n              break\n            }\n          }\n\n          /* If the current anchor is hidden, continue with its parent */\n          let offset = target.offsetTop\n          while (!offset && target.parentElement) {\n            target = target.parentElement\n            offset = target.offsetTop\n          }\n\n          /* Fix anchor offsets in tables - see https://bit.ly/3CUFOcn */\n          let parent = target.offsetParent as HTMLElement\n          for (; parent; parent = parent.offsetParent as HTMLElement)\n            offset += parent.offsetTop\n\n          /* Map reversed anchor path to vertical offset */\n          return index.set(\n            [...path = [...path, anchor]].reverse(),\n            offset\n          )\n        }, new Map<HTMLAnchorElement[], number>()))\n      })\n        .pipe(\n\n          /* Sort index by vertical offset (see https://bit.ly/30z6QSO) */\n          map(index => new Map([...index].sort(([, a], [, b]) => a - b))),\n          combineLatestWith(adjust$),\n\n          /* Re-compute partition when viewport offset changes */\n          switchMap(([index, adjust]) => viewport$\n            .pipe(\n              scan(([prev, next], { offset: { y }, size }) => {\n                const last = y + size.height >= Math.floor(body.height)\n\n                /* Look forward */\n                while 
(next.length) {\n                  const [, offset] = next[0]\n                  if (offset - adjust < y || last) {\n                    prev = [...prev, next.shift()!]\n                  } else {\n                    break\n                  }\n                }\n\n                /* Look backward */\n                while (prev.length) {\n                  const [, offset] = prev[prev.length - 1]\n                  if (offset - adjust >= y && !last) {\n                    next = [prev.pop()!, ...next]\n                  } else {\n                    break\n                  }\n                }\n\n                /* Return partition */\n                return [prev, next]\n              }, [[], [...index]]),\n              distinctUntilChanged((a, b) => (\n                a[0] === b[0] &&\n                a[1] === b[1]\n              ))\n            )\n          )\n        )\n      )\n    )\n\n  /* Compute and return anchor list migrations */\n  return partition$\n    .pipe(\n      map(([prev, next]) => ({\n        prev: prev.map(([path]) => path),\n        next: next.map(([path]) => path)\n      })),\n\n      /* Extract anchor list migrations */\n      startWith({ prev: [], next: [] }),\n      bufferCount(2, 1),\n      map(([a, b]) => {\n\n        /* Moving down */\n        if (a.prev.length < b.prev.length) {\n          return {\n            prev: b.prev.slice(Math.max(0, a.prev.length - 1), b.prev.length),\n            next: []\n          }\n\n        /* Moving up */\n        } else {\n          return {\n            prev: b.prev.slice(-1),\n            next: b.next.slice(0, b.next.length - a.next.length)\n          }\n        }\n      })\n    )\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Mount table of contents\n *\n * @param el - Table of contents element\n * @param options - Options\n *\n * @returns Table of contents component observable\n */\nexport function mountTableOfContents(\n  el: HTMLElement, { 
viewport$, header$, main$, target$ }: MountOptions\n): Observable<Component<TableOfContents>> {\n  return defer(() => {\n    const push$ = new Subject<TableOfContents>()\n    const done$ = push$.pipe(ignoreElements(), endWith(true))\n    push$.subscribe(({ prev, next }) => {\n\n      /* Look forward */\n      for (const [anchor] of next) {\n        anchor.classList.remove(\"md-nav__link--passed\")\n        anchor.classList.remove(\"md-nav__link--active\")\n      }\n\n      /* Look backward */\n      for (const [index, [anchor]] of prev.entries()) {\n        anchor.classList.add(\"md-nav__link--passed\")\n        anchor.classList.toggle(\n          \"md-nav__link--active\",\n          index === prev.length - 1\n        )\n      }\n    })\n\n    /* Set up following, if enabled */\n    if (feature(\"toc.follow\")) {\n\n      /* Toggle smooth scrolling only for anchor clicks */\n      const smooth$ = merge(\n        viewport$.pipe(debounceTime(1), map(() => undefined)),\n        viewport$.pipe(debounceTime(250), map(() => \"smooth\" as const))\n      )\n\n      /* Bring active anchor into view */ // @todo: refactor\n      push$\n        .pipe(\n          filter(({ prev }) => prev.length > 0),\n          combineLatestWith(main$.pipe(observeOn(asyncScheduler))),\n          withLatestFrom(smooth$)\n        )\n          .subscribe(([[{ prev }], behavior]) => {\n            const [anchor] = prev[prev.length - 1]\n            if (anchor.offsetHeight) {\n\n              /* Retrieve overflowing container and scroll */\n              const container = getElementContainer(anchor)\n              if (typeof container !== \"undefined\") {\n                const offset = anchor.offsetTop - container.offsetTop\n                const { height } = getElementSize(container)\n                container.scrollTo({\n                  top: offset - height / 2,\n                  behavior\n                })\n              }\n            }\n          })\n    }\n\n    /* Set up anchor 
tracking, if enabled */\n    if (feature(\"navigation.tracking\"))\n      viewport$\n        .pipe(\n          takeUntil(done$),\n          distinctUntilKeyChanged(\"offset\"),\n          debounceTime(250),\n          skip(1),\n          takeUntil(target$.pipe(skip(1))),\n          repeat({ delay: 250 }),\n          withLatestFrom(push$)\n        )\n          .subscribe(([, { prev }]) => {\n            const url = getLocation()\n\n            /* Set hash fragment to active anchor */\n            const anchor = prev[prev.length - 1]\n            if (anchor && anchor.length) {\n              const [active] = anchor\n              const { hash } = new URL(active.href)\n              if (url.hash !== hash) {\n                url.hash = hash\n                history.replaceState({}, \"\", `${url}`)\n              }\n\n            /* Reset anchor when at the top */\n            } else {\n              url.hash = \"\"\n              history.replaceState({}, \"\", `${url}`)\n            }\n          })\n\n    /* Create and return component */\n    return watchTableOfContents(el, { viewport$, header$ })\n      .pipe(\n        tap(state => push$.next(state)),\n        finalize(() => push$.complete()),\n        map(state => ({ ref: el, ...state }))\n      )\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS 
IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  Subject,\n  bufferCount,\n  combineLatest,\n  distinctUntilChanged,\n  distinctUntilKeyChanged,\n  endWith,\n  finalize,\n  fromEvent,\n  ignoreElements,\n  map,\n  repeat,\n  skip,\n  takeUntil,\n  tap\n} from \"rxjs\"\n\nimport { Viewport } from \"~/browser\"\n\nimport { Component } from \"../_\"\nimport { Header } from \"../header\"\nimport { Main } from \"../main\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Back-to-top button\n */\nexport interface BackToTop {\n  hidden: boolean                      /* Back-to-top button is hidden */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch options\n */\ninterface WatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  main$: Observable<Main>              /* Main area observable */\n  target$: Observable<HTMLElement>     /* Location target observable */\n}\n\n/**\n * Mount options\n */\ninterface MountOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  header$: Observable<Header>          /* Header observable */\n  main$: Observable<Main>              /* Main area observable */\n  target$: Observable<HTMLElement>     /* Location target observable */\n}\n\n/* 
----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Watch back-to-top\n *\n * @param _el - Back-to-top element\n * @param options - Options\n *\n * @returns Back-to-top observable\n */\nexport function watchBackToTop(\n  _el: HTMLElement, { viewport$, main$, target$ }: WatchOptions\n): Observable<BackToTop> {\n\n  /* Compute direction */\n  const direction$ = viewport$\n    .pipe(\n      map(({ offset: { y } }) => y),\n      bufferCount(2, 1),\n      map(([a, b]) => a > b && b > 0),\n      distinctUntilChanged()\n    )\n\n  /* Compute whether main area is active */\n  const active$ = main$\n    .pipe(\n      map(({ active }) => active)\n    )\n\n  /* Compute threshold for hiding */\n  return combineLatest([active$, direction$])\n    .pipe(\n      map(([active, direction]) => !(active && direction)),\n      distinctUntilChanged(),\n      takeUntil(target$.pipe(skip(1))),\n      endWith(true),\n      repeat({ delay: 250 }),\n      map(hidden => ({ hidden }))\n    )\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Mount back-to-top\n *\n * @param el - Back-to-top element\n * @param options - Options\n *\n * @returns Back-to-top component observable\n */\nexport function mountBackToTop(\n  el: HTMLElement, { viewport$, header$, main$, target$ }: MountOptions\n): Observable<Component<BackToTop>> {\n  const push$ = new Subject<BackToTop>()\n  const done$ = push$.pipe(ignoreElements(), endWith(true))\n  push$.subscribe({\n\n    /* Handle emission */\n    next({ hidden }) {\n      el.hidden = hidden\n      if (hidden) {\n        el.setAttribute(\"tabindex\", \"-1\")\n        el.blur()\n      } else {\n        el.removeAttribute(\"tabindex\")\n      }\n    },\n\n    /* Handle complete */\n    complete() {\n      el.style.top = \"\"\n      el.hidden = true\n      el.removeAttribute(\"tabindex\")\n   
 }\n  })\n\n  /* Watch header height */\n  header$\n    .pipe(\n      takeUntil(done$),\n      distinctUntilKeyChanged(\"height\")\n    )\n      .subscribe(({ height }) => {\n        el.style.top = `${height + 16}px`\n      })\n\n  /* Go back to top */\n  fromEvent(el, \"click\")\n    .subscribe(ev => {\n      ev.preventDefault()\n      window.scrollTo({ top: 0 })\n    })\n\n  /* Create and return component */\n  return watchBackToTop(el, { viewport$, main$, target$ })\n    .pipe(\n      tap(state => push$.next(state)),\n      finalize(() => push$.complete()),\n      map(state => ({ ref: el, ...state }))\n    )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  EMPTY,\n  Observable,\n  filter,\n  finalize,\n  map,\n  mergeMap,\n  skip,\n  switchMap,\n  take,\n  takeUntil\n} from \"rxjs\"\n\nimport { feature } from \"~/_\"\nimport {\n  Viewport,\n  getElements,\n  watchElementVisibility\n} from \"~/browser\"\nimport { mountInlineTooltip2 } from \"~/components/tooltip2\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch options\n */\ninterface PatchOptions {\n  document$: Observable<Document>      /* Document observable */\n  viewport$: Observable<Viewport>      /* Viewport observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch ellipsis\n *\n * This function will fetch all elements that are shortened with ellipsis, and\n * filter those which are visible. Once they become visible, they stay in that\n * state, even though they may be hidden again. 
This optimization is necessary\n * to reduce pressure on the browser, with elements fading in and out of view.\n *\n * @param options - Options\n */\nexport function patchEllipsis(\n  { document$, viewport$ }: PatchOptions\n): void {\n  document$\n    .pipe(\n      switchMap(() => getElements(\".md-ellipsis\")),\n      mergeMap(el => watchElementVisibility(el)\n        .pipe(\n          takeUntil(document$.pipe(skip(1))),\n          filter(visible => visible),\n          map(() => el),\n          take(1)\n        )\n      ),\n      filter(el => el.offsetWidth < el.scrollWidth),\n      mergeMap(el => {\n        const text = el.innerText\n        const host = el.closest(\"a\") || el\n        host.title = text\n\n        // Do not mount improved tooltip if feature is disabled\n        if (!feature(\"content.tooltips\"))\n          return EMPTY\n\n        /* Mount tooltip */\n        return mountInlineTooltip2(host, { viewport$ })\n          .pipe(\n            takeUntil(document$.pipe(skip(1))),\n            finalize(() => host.removeAttribute(\"title\"))\n          )\n      })\n    )\n      .subscribe()\n\n  // @todo move this outside of here and fix memleaks\n  if (feature(\"content.tooltips\"))\n    document$\n      .pipe(\n        switchMap(() => getElements(\".md-status\")),\n        mergeMap(el => mountInlineTooltip2(el, { viewport$ }))\n      )\n        .subscribe()\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission 
notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  fromEvent,\n  map,\n  mergeMap,\n  switchMap,\n  takeWhile,\n  tap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport { getElements } from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch options\n */\ninterface PatchOptions {\n  document$: Observable<Document>      /* Document observable */\n  tablet$: Observable<boolean>         /* Media tablet observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch indeterminate checkboxes\n *\n * This function replaces the indeterminate \"pseudo state\" with the actual\n * indeterminate state, which is used to keep navigation always expanded.\n *\n * @param options - Options\n */\nexport function patchIndeterminate(\n  { document$, tablet$ }: PatchOptions\n): void {\n  document$\n    .pipe(\n      switchMap(() => getElements<HTMLInputElement>(\n        \".md-toggle--indeterminate\"\n      )),\n      tap(el => {\n        el.indeterminate = true\n        el.checked = false\n      }),\n      mergeMap(el => fromEvent(el, \"change\")\n        .pipe(\n          takeWhile(() => 
el.classList.contains(\"md-toggle--indeterminate\")),\n          map(() => el)\n        )\n      ),\n      withLatestFrom(tablet$)\n    )\n      .subscribe(([el, tablet]) => {\n        el.classList.remove(\"md-toggle--indeterminate\")\n        if (tablet)\n          el.checked = false\n      })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  filter,\n  fromEvent,\n  map,\n  mergeMap,\n  switchMap,\n  tap\n} from \"rxjs\"\n\nimport { getElements } from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch options\n */\ninterface PatchOptions {\n  document$: Observable<Document>      /* Document observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Check whether the given device is an Apple device\n *\n * @returns Test result\n */\nfunction isAppleDevice(): boolean {\n  return /(iPad|iPhone|iPod)/.test(navigator.userAgent)\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch all elements with `data-md-scrollfix` attributes\n *\n * This is a year-old patch which ensures that overflow scrolling works at the\n * top and bottom of containers on iOS by ensuring a `1px` scroll offset upon\n * the start of a touch event.\n *\n * @see https://bit.ly/2SCtAOO - Original source\n *\n * @param options - Options\n */\nexport function patchScrollfix(\n  { document$ }: PatchOptions\n): void {\n  document$\n    .pipe(\n      switchMap(() => getElements(\"[data-md-scrollfix]\")),\n      tap(el => el.removeAttribute(\"data-md-scrollfix\")),\n      filter(isAppleDevice),\n      mergeMap(el => fromEvent(el, \"touchstart\")\n        .pipe(\n          map(() => el)\n   
     )\n      )\n    )\n      .subscribe(el => {\n        const top = el.scrollTop\n\n        /* We're at the top of the container */\n        if (top === 0) {\n          el.scrollTop = 1\n\n        /* We're at the bottom of the container */\n        } else if (top + el.offsetHeight === el.scrollHeight) {\n          el.scrollTop = top - 1\n        }\n      })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  Observable,\n  combineLatest,\n  delay,\n  map,\n  of,\n  switchMap,\n  withLatestFrom\n} from \"rxjs\"\n\nimport {\n  Viewport,\n  watchToggle\n} from \"~/browser\"\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch options\n */\ninterface PatchOptions {\n  viewport$: Observable<Viewport>      /* Viewport observable */\n  tablet$: Observable<boolean>         /* Media tablet observable */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Patch the document body to lock when search is open\n *\n * For mobile and tablet viewports, the search is rendered full screen, which\n * leads to scroll leaking when at the top or bottom of the search result. This\n * function locks the body when the search is in full screen mode, and restores\n * the scroll position when leaving.\n *\n * @param options - Options\n */\nexport function patchScrolllock(\n  { viewport$, tablet$ }: PatchOptions\n): void {\n  combineLatest([watchToggle(\"search\"), tablet$])\n    .pipe(\n      map(([active, tablet]) => active && !tablet),\n      switchMap(active => of(active)\n        .pipe(\n          delay(active ? 
400 : 100)\n        )\n      ),\n      withLatestFrom(viewport$)\n    )\n      .subscribe(([active, { offset: { y }}]) => {\n        if (active) {\n          document.body.setAttribute(\"data-md-scrolllock\", \"\")\n          document.body.style.top = `-${y}px`\n        } else {\n          const value = -1 * parseInt(document.body.style.top, 10)\n          document.body.removeAttribute(\"data-md-scrolllock\")\n          document.body.style.top = \"\"\n          if (value)\n            window.scrollTo(0, value)\n        }\n      })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Polyfills\n * ------------------------------------------------------------------------- */\n\n/* Polyfill `Object.entries` */\nif (!Object.entries)\n  Object.entries = function (obj: object) {\n    const data: [string, string][] = []\n    for (const key of Object.keys(obj))\n      // @ts-expect-error - ignore property access warning\n      data.push([key, obj[key]])\n\n    /* Return entries */\n    return data\n  }\n\n/* Polyfill `Object.values` */\nif (!Object.values)\n  Object.values = function (obj: object) {\n    const data: string[] = []\n    for (const key of Object.keys(obj))\n      // @ts-expect-error - ignore property access warning\n      data.push(obj[key])\n\n    /* Return values */\n    return data\n  }\n\n/* ------------------------------------------------------------------------- */\n\n/* Polyfills for `Element` */\nif (typeof Element !== \"undefined\") {\n\n  /* Polyfill `Element.scrollTo` */\n  if (!Element.prototype.scrollTo)\n    Element.prototype.scrollTo = function (\n      x?: ScrollToOptions | number, y?: number\n    ): void {\n      if (typeof x === \"object\") {\n        this.scrollLeft = x.left!\n        this.scrollTop = x.top!\n      } else {\n        this.scrollLeft = x!\n        this.scrollTop = y!\n      }\n    }\n\n  /* Polyfill `Element.replaceWith` */\n  if (!Element.prototype.replaceWith)\n    Element.prototype.replaceWith = function (\n      ...nodes: Array<string | Node>\n    ): void {\n      const parent = this.parentNode\n      if (parent) {\n        if (nodes.length === 0)\n          parent.removeChild(this)\n\n        /* Replace children and create text nodes */\n   
     for (let i = nodes.length - 1; i >= 0; i--) {\n          let node = nodes[i]\n          if (typeof node === \"string\")\n            node = document.createTextNode(node)\n          else if (node.parentNode)\n            node.parentNode.removeChild(node)\n\n          /* Replace child or insert before previous sibling */\n          if (!i)\n            parent.replaceChild(node, this)\n          else\n            parent.insertBefore(this.previousSibling!, node)\n        }\n      }\n    }\n}\n"],
+  "mappings": "2rCAAA,IAAAA,GAAAC,GAAA,CAAAC,GAAAC,KAAA,EAAC,SAAUC,EAAQC,EAAS,CAC1B,OAAOH,IAAY,UAAY,OAAOC,IAAW,YAAcE,EAAQ,EACvE,OAAO,QAAW,YAAc,OAAO,IAAM,OAAOA,CAAO,EAC1DA,EAAQ,CACX,GAAEH,GAAO,UAAY,CAAE,aASrB,SAASI,EAA0BC,EAAO,CACxC,IAAIC,EAAmB,GACnBC,EAA0B,GAC1BC,EAAiC,KAEjCC,EAAsB,CACxB,KAAM,GACN,OAAQ,GACR,IAAK,GACL,IAAK,GACL,MAAO,GACP,SAAU,GACV,OAAQ,GACR,KAAM,GACN,MAAO,GACP,KAAM,GACN,KAAM,GACN,SAAU,GACV,iBAAkB,EACpB,EAOA,SAASC,EAAmBC,EAAI,CAC9B,MACE,GAAAA,GACAA,IAAO,UACPA,EAAG,WAAa,QAChBA,EAAG,WAAa,QAChB,cAAeA,GACf,aAAcA,EAAG,UAKrB,CASA,SAASC,EAA8BD,EAAI,CACzC,IAAIE,GAAOF,EAAG,KACVG,GAAUH,EAAG,QAUjB,MARI,GAAAG,KAAY,SAAWL,EAAoBI,EAAI,GAAK,CAACF,EAAG,UAIxDG,KAAY,YAAc,CAACH,EAAG,UAI9BA,EAAG,kBAKT,CAOA,SAASI,EAAqBJ,EAAI,CAC5BA,EAAG,UAAU,SAAS,eAAe,IAGzCA,EAAG,UAAU,IAAI,eAAe,EAChCA,EAAG,aAAa,2BAA4B,EAAE,EAChD,CAOA,SAASK,EAAwBL,EAAI,CAC9BA,EAAG,aAAa,0BAA0B,IAG/CA,EAAG,UAAU,OAAO,eAAe,EACnCA,EAAG,gBAAgB,0BAA0B,EAC/C,CAUA,SAASM,EAAUC,EAAG,CAChBA,EAAE,SAAWA,EAAE,QAAUA,EAAE,UAI3BR,EAAmBL,EAAM,aAAa,GACxCU,EAAqBV,EAAM,aAAa,EAG1CC,EAAmB,GACrB,CAUA,SAASa,EAAcD,EAAG,CACxBZ,EAAmB,EACrB,CASA,SAASc,EAAQF,EAAG,CAEbR,EAAmBQ,EAAE,MAAM,IAI5BZ,GAAoBM,EAA8BM,EAAE,MAAM,IAC5DH,EAAqBG,EAAE,MAAM,CAEjC,CAMA,SAASG,EAAOH,EAAG,CACZR,EAAmBQ,EAAE,MAAM,IAK9BA,EAAE,OAAO,UAAU,SAAS,eAAe,GAC3CA,EAAE,OAAO,aAAa,0BAA0B,KAMhDX,EAA0B,GAC1B,OAAO,aAAaC,CAA8B,EAClDA,EAAiC,OAAO,WAAW,UAAW,CAC5DD,EAA0B,EAC5B,EAAG,GAAG,EACNS,EAAwBE,EAAE,MAAM,EAEpC,CAOA,SAASI,EAAmBJ,EAAG,CACzB,SAAS,kBAAoB,WAK3BX,IACFD,EAAmB,IAErBiB,GAA+B,EAEnC,CAQA,SAASA,IAAiC,CACxC,SAAS,iBAAiB,YAAaC,CAAoB,EAC3D,SAAS,iBAAiB,YAAaA,CAAoB,EAC3D,SAAS,iBAAiB,UAAWA,CAAoB,EACzD,SAAS,iBAAiB,cAAeA,CAAoB,EAC7D,SAAS,iBAAiB,cAAeA,CAAoB,EAC7D,SAAS,iBAAiB,YAAaA,CAAoB,EAC3D,SAAS,iBAAiB,YAAaA,CAAoB,EAC3D,SAAS,iBAAiB,aAAcA,CAAoB,EAC5D,SAAS,iBAAiB,WAAYA,CAAoB,CAC5D,CAEA,SAASC,IAAoC,CAC3C,SAAS,oBAAoB,YAAaD,CAAoB,EAC9D,SAAS,oBAAoB,YAAaA,CAAoB,EAC9D,SAAS,oBAAoB,UAAWA,CAAoB,EAC5D,SAAS,oBAAoB,cAAeA,CAAoB,EAChE,SAAS,oBAAoB,cAAeA,CAAoB,EAChE,SAAS,oBAAoB,YAAaA,CAAoB,EAC9D,SA
AS,oBAAoB,YAAaA,CAAoB,EAC9D,SAAS,oBAAoB,aAAcA,CAAoB,EAC/D,SAAS,oBAAoB,WAAYA,CAAoB,CAC/D,CASA,SAASA,EAAqBN,EAAG,CAG3BA,EAAE,OAAO,UAAYA,EAAE,OAAO,SAAS,YAAY,IAAM,SAI7DZ,EAAmB,GACnBmB,GAAkC,EACpC,CAKA,SAAS,iBAAiB,UAAWR,EAAW,EAAI,EACpD,SAAS,iBAAiB,YAAaE,EAAe,EAAI,EAC1D,SAAS,iBAAiB,cAAeA,EAAe,EAAI,EAC5D,SAAS,iBAAiB,aAAcA,EAAe,EAAI,EAC3D,SAAS,iBAAiB,mBAAoBG,EAAoB,EAAI,EAEtEC,GAA+B,EAM/BlB,EAAM,iBAAiB,QAASe,EAAS,EAAI,EAC7Cf,EAAM,iBAAiB,OAAQgB,EAAQ,EAAI,EAOvChB,EAAM,WAAa,KAAK,wBAA0BA,EAAM,KAI1DA,EAAM,KAAK,aAAa,wBAAyB,EAAE,EAC1CA,EAAM,WAAa,KAAK,gBACjC,SAAS,gBAAgB,UAAU,IAAI,kBAAkB,EACzD,SAAS,gBAAgB,aAAa,wBAAyB,EAAE,EAErE,CAKA,GAAI,OAAO,QAAW,aAAe,OAAO,UAAa,YAAa,CAIpE,OAAO,0BAA4BD,EAInC,IAAIsB,EAEJ,GAAI,CACFA,EAAQ,IAAI,YAAY,8BAA8B,CACxD,OAASC,EAAO,CAEdD,EAAQ,SAAS,YAAY,aAAa,EAC1CA,EAAM,gBAAgB,+BAAgC,GAAO,GAAO,CAAC,CAAC,CACxE,CAEA,OAAO,cAAcA,CAAK,CAC5B,CAEI,OAAO,UAAa,aAGtBtB,EAA0B,QAAQ,CAGtC,CAAE,ICvTF,IAAAwB,GAAAC,GAAA,CAAAC,GAAAC,KAAA;AAAA;AAAA;AAAA;AAAA;AAAA,IAMC,SAA0CC,EAAMC,EAAS,CACtD,OAAOH,IAAY,UAAY,OAAOC,IAAW,SACnDA,GAAO,QAAUE,EAAQ,EAClB,OAAO,QAAW,YAAc,OAAO,IAC9C,OAAO,CAAC,EAAGA,CAAO,EACX,OAAOH,IAAY,SAC1BA,GAAQ,YAAiBG,EAAQ,EAEjCD,EAAK,YAAiBC,EAAQ,CAChC,GAAGH,GAAM,UAAW,CACpB,OAAiB,UAAW,CAClB,IAAII,EAAuB,CAE/B,IACC,SAASC,EAAyBC,EAAqBC,EAAqB,CAEnF,aAGAA,EAAoB,EAAED,EAAqB,CACzC,QAAW,UAAW,CAAE,OAAqBE,EAAW,CAC1D,CAAC,EAGD,IAAIC,EAAeF,EAAoB,GAAG,EACtCG,EAAoCH,EAAoB,EAAEE,CAAY,EAEtEE,EAASJ,EAAoB,GAAG,EAChCK,EAA8BL,EAAoB,EAAEI,CAAM,EAE1DE,EAAaN,EAAoB,GAAG,EACpCO,EAA8BP,EAAoB,EAAEM,CAAU,EAOlE,SAASE,EAAQC,EAAM,CACrB,GAAI,CACF,OAAO,SAAS,YAAYA,CAAI,CAClC,OAASC,EAAK,CACZ,MAAO,EACT,CACF,CAUA,IAAIC,EAAqB,SAA4BC,EAAQ,CAC3D,IAAIC,EAAeN,EAAe,EAAEK,CAAM,EAC1C,OAAAJ,EAAQ,KAAK,EACNK,CACT,EAEiCC,EAAeH,EAOhD,SAASI,EAAkBC,EAAO,CAChC,IAAIC,EAAQ,SAAS,gBAAgB,aAAa,KAAK,IAAM,MACzDC,EAAc,SAAS,cAAc,UAAU,EAEnDA,EAAY,MAAM,SAAW,OAE7BA,EAAY,MAAM,OAAS,IAC3BA,EAAY,MAAM,QAAU,IAC5BA,EAAY,MAAM,OAAS,IAE3BA,EAAY,MAAM,SAAW,WAC7BA,EAAY,MAAMD,EAAQ,QAAU,MAAM,EAAI,UAE9C,IAAIE,EAAY,OAAO,aAAe,SAAS,gBAAgB,UAC/D,OAAAD,EAAY,M
AAM,IAAM,GAAG,OAAOC,EAAW,IAAI,EACjDD,EAAY,aAAa,WAAY,EAAE,EACvCA,EAAY,MAAQF,EACbE,CACT,CAYA,IAAIE,GAAiB,SAAwBJ,EAAOK,EAAS,CAC3D,IAAIH,EAAcH,EAAkBC,CAAK,EACzCK,EAAQ,UAAU,YAAYH,CAAW,EACzC,IAAIL,EAAeN,EAAe,EAAEW,CAAW,EAC/C,OAAAV,EAAQ,MAAM,EACdU,EAAY,OAAO,EACZL,CACT,EASIS,GAAsB,SAA6BV,EAAQ,CAC7D,IAAIS,EAAU,UAAU,OAAS,GAAK,UAAU,CAAC,IAAM,OAAY,UAAU,CAAC,EAAI,CAChF,UAAW,SAAS,IACtB,EACIR,EAAe,GAEnB,OAAI,OAAOD,GAAW,SACpBC,EAAeO,GAAeR,EAAQS,CAAO,EACpCT,aAAkB,kBAAoB,CAAC,CAAC,OAAQ,SAAU,MAAO,MAAO,UAAU,EAAE,SAASA,GAAW,KAA4B,OAASA,EAAO,IAAI,EAEjKC,EAAeO,GAAeR,EAAO,MAAOS,CAAO,GAEnDR,EAAeN,EAAe,EAAEK,CAAM,EACtCJ,EAAQ,MAAM,GAGTK,CACT,EAEiCU,EAAgBD,GAEjD,SAASE,EAAQC,EAAK,CAAE,0BAA2B,OAAI,OAAO,QAAW,YAAc,OAAO,OAAO,UAAa,SAAYD,EAAU,SAAiBC,EAAK,CAAE,OAAO,OAAOA,CAAK,EAAYD,EAAU,SAAiBC,EAAK,CAAE,OAAOA,GAAO,OAAO,QAAW,YAAcA,EAAI,cAAgB,QAAUA,IAAQ,OAAO,UAAY,SAAW,OAAOA,CAAK,EAAYD,EAAQC,CAAG,CAAG,CAUzX,IAAIC,GAAyB,UAAkC,CAC7D,IAAIL,EAAU,UAAU,OAAS,GAAK,UAAU,CAAC,IAAM,OAAY,UAAU,CAAC,EAAI,CAAC,EAE/EM,EAAkBN,EAAQ,OAC1BO,EAASD,IAAoB,OAAS,OAASA,EAC/CE,EAAYR,EAAQ,UACpBT,EAASS,EAAQ,OACjBS,GAAOT,EAAQ,KAEnB,GAAIO,IAAW,QAAUA,IAAW,MAClC,MAAM,IAAI,MAAM,oDAAoD,EAItE,GAAIhB,IAAW,OACb,GAAIA,GAAUY,EAAQZ,CAAM,IAAM,UAAYA,EAAO,WAAa,EAAG,CACnE,GAAIgB,IAAW,QAAUhB,EAAO,aAAa,UAAU,EACrD,MAAM,IAAI,MAAM,mFAAmF,EAGrG,GAAIgB,IAAW,QAAUhB,EAAO,aAAa,UAAU,GAAKA,EAAO,aAAa,UAAU,GACxF,MAAM,IAAI,MAAM,uGAAwG,CAE5H,KACE,OAAM,IAAI,MAAM,6CAA6C,EAKjE,GAAIkB,GACF,OAAOP,EAAaO,GAAM,CACxB,UAAWD,CACb,CAAC,EAIH,GAAIjB,EACF,OAAOgB,IAAW,MAAQd,EAAYF,CAAM,EAAIW,EAAaX,EAAQ,CACnE,UAAWiB,CACb,CAAC,CAEL,EAEiCE,GAAmBL,GAEpD,SAASM,GAAiBP,EAAK,CAAE,0BAA2B,OAAI,OAAO,QAAW,YAAc,OAAO,OAAO,UAAa,SAAYO,GAAmB,SAAiBP,EAAK,CAAE,OAAO,OAAOA,CAAK,EAAYO,GAAmB,SAAiBP,EAAK,CAAE,OAAOA,GAAO,OAAO,QAAW,YAAcA,EAAI,cAAgB,QAAUA,IAAQ,OAAO,UAAY,SAAW,OAAOA,CAAK,EAAYO,GAAiBP,CAAG,CAAG,CAE7Z,SAASQ,GAAgBC,EAAUC,EAAa,CAAE,GAAI,EAAED,aAAoBC,GAAgB,MAAM,IAAI,UAAU,mCAAmC,CAAK,CAExJ,SAASC,GAAkBxB,EAAQyB,EAAO,CAAE,QAASC,EAAI,EAAGA,EAAID,EAAM,OAAQC,IAAK,CAAE,IAAIC,EAAaF,EAAMC,CAAC,EAAGC,EAAW,W
AAaA,EAAW,YAAc,GAAOA,EAAW,aAAe,GAAU,UAAWA,IAAYA,EAAW,SAAW,IAAM,OAAO,eAAe3B,EAAQ2B,EAAW,IAAKA,CAAU,CAAG,CAAE,CAE5T,SAASC,GAAaL,EAAaM,EAAYC,EAAa,CAAE,OAAID,GAAYL,GAAkBD,EAAY,UAAWM,CAAU,EAAOC,GAAaN,GAAkBD,EAAaO,CAAW,EAAUP,CAAa,CAEtN,SAASQ,GAAUC,EAAUC,EAAY,CAAE,GAAI,OAAOA,GAAe,YAAcA,IAAe,KAAQ,MAAM,IAAI,UAAU,oDAAoD,EAAKD,EAAS,UAAY,OAAO,OAAOC,GAAcA,EAAW,UAAW,CAAE,YAAa,CAAE,MAAOD,EAAU,SAAU,GAAM,aAAc,EAAK,CAAE,CAAC,EAAOC,GAAYC,GAAgBF,EAAUC,CAAU,CAAG,CAEhY,SAASC,GAAgBC,EAAGC,EAAG,CAAE,OAAAF,GAAkB,OAAO,gBAAkB,SAAyBC,EAAGC,EAAG,CAAE,OAAAD,EAAE,UAAYC,EAAUD,CAAG,EAAUD,GAAgBC,EAAGC,CAAC,CAAG,CAEzK,SAASC,GAAaC,EAAS,CAAE,IAAIC,EAA4BC,GAA0B,EAAG,OAAO,UAAgC,CAAE,IAAIC,EAAQC,GAAgBJ,CAAO,EAAGK,EAAQ,GAAIJ,EAA2B,CAAE,IAAIK,EAAYF,GAAgB,IAAI,EAAE,YAAaC,EAAS,QAAQ,UAAUF,EAAO,UAAWG,CAAS,CAAG,MAASD,EAASF,EAAM,MAAM,KAAM,SAAS,EAAK,OAAOI,GAA2B,KAAMF,CAAM,CAAG,CAAG,CAExa,SAASE,GAA2BC,EAAMC,EAAM,CAAE,OAAIA,IAAS3B,GAAiB2B,CAAI,IAAM,UAAY,OAAOA,GAAS,YAAsBA,EAAeC,GAAuBF,CAAI,CAAG,CAEzL,SAASE,GAAuBF,EAAM,CAAE,GAAIA,IAAS,OAAU,MAAM,IAAI,eAAe,2DAA2D,EAAK,OAAOA,CAAM,CAErK,SAASN,IAA4B,CAA0E,GAApE,OAAO,SAAY,aAAe,CAAC,QAAQ,WAA6B,QAAQ,UAAU,KAAM,MAAO,GAAO,GAAI,OAAO,OAAU,WAAY,MAAO,GAAM,GAAI,CAAE,YAAK,UAAU,SAAS,KAAK,QAAQ,UAAU,KAAM,CAAC,EAAG,UAAY,CAAC,CAAC,CAAC,EAAU,EAAM,OAASS,EAAG,CAAE,MAAO,EAAO,CAAE,CAEnU,SAASP,GAAgBP,EAAG,CAAE,OAAAO,GAAkB,OAAO,eAAiB,OAAO,eAAiB,SAAyBP,EAAG,CAAE,OAAOA,EAAE,WAAa,OAAO,eAAeA,CAAC,CAAG,EAAUO,GAAgBP,CAAC,CAAG,CAa5M,SAASe,GAAkBC,EAAQC,EAAS,CAC1C,IAAIC,EAAY,kBAAkB,OAAOF,CAAM,EAE/C,GAAKC,EAAQ,aAAaC,CAAS,EAInC,OAAOD,EAAQ,aAAaC,CAAS,CACvC,CAOA,IAAIC,GAAyB,SAAUC,EAAU,CAC/CxB,GAAUuB,EAAWC,CAAQ,EAE7B,IAAIC,EAASnB,GAAaiB,CAAS,EAMnC,SAASA,EAAUG,EAAShD,EAAS,CACnC,IAAIiD,EAEJ,OAAArC,GAAgB,KAAMiC,CAAS,EAE/BI,EAAQF,EAAO,KAAK,IAAI,EAExBE,EAAM,eAAejD,CAAO,EAE5BiD,EAAM,YAAYD,CAAO,EAElBC,CACT,CAQA,OAAA9B,GAAa0B,EAAW,CAAC,CACvB,IAAK,iBACL,MAAO,UAA0B,CAC/B,IAAI7C,EAAU,UAAU,OAAS,GAAK,UAAU,CAAC,IAAM,OAAY,UAAU,CAAC,EAAI,CAAC,EACnF,KAAK,OAAS,OAAOA,EAAQ,QAAW,WAAaA,EAAQ,OAAS,KAAK,cAC3E,KAAK,OAAS,OAAOA,EAAQ,QAAW
,WAAaA,EAAQ,OAAS,KAAK,cAC3E,KAAK,KAAO,OAAOA,EAAQ,MAAS,WAAaA,EAAQ,KAAO,KAAK,YACrE,KAAK,UAAYW,GAAiBX,EAAQ,SAAS,IAAM,SAAWA,EAAQ,UAAY,SAAS,IACnG,CAMF,EAAG,CACD,IAAK,cACL,MAAO,SAAqBgD,EAAS,CACnC,IAAIE,EAAS,KAEb,KAAK,SAAWlE,EAAe,EAAEgE,EAAS,QAAS,SAAUR,GAAG,CAC9D,OAAOU,EAAO,QAAQV,EAAC,CACzB,CAAC,CACH,CAMF,EAAG,CACD,IAAK,UACL,MAAO,SAAiBA,EAAG,CACzB,IAAIQ,EAAUR,EAAE,gBAAkBA,EAAE,cAChCjC,GAAS,KAAK,OAAOyC,CAAO,GAAK,OACjCvC,GAAOC,GAAgB,CACzB,OAAQH,GACR,UAAW,KAAK,UAChB,OAAQ,KAAK,OAAOyC,CAAO,EAC3B,KAAM,KAAK,KAAKA,CAAO,CACzB,CAAC,EAED,KAAK,KAAKvC,GAAO,UAAY,QAAS,CACpC,OAAQF,GACR,KAAME,GACN,QAASuC,EACT,eAAgB,UAA0B,CACpCA,GACFA,EAAQ,MAAM,EAGhB,OAAO,aAAa,EAAE,gBAAgB,CACxC,CACF,CAAC,CACH,CAMF,EAAG,CACD,IAAK,gBACL,MAAO,SAAuBA,EAAS,CACrC,OAAOP,GAAkB,SAAUO,CAAO,CAC5C,CAMF,EAAG,CACD,IAAK,gBACL,MAAO,SAAuBA,EAAS,CACrC,IAAIG,EAAWV,GAAkB,SAAUO,CAAO,EAElD,GAAIG,EACF,OAAO,SAAS,cAAcA,CAAQ,CAE1C,CAQF,EAAG,CACD,IAAK,cAML,MAAO,SAAqBH,EAAS,CACnC,OAAOP,GAAkB,OAAQO,CAAO,CAC1C,CAKF,EAAG,CACD,IAAK,UACL,MAAO,UAAmB,CACxB,KAAK,SAAS,QAAQ,CACxB,CACF,CAAC,EAAG,CAAC,CACH,IAAK,OACL,MAAO,SAAczD,EAAQ,CAC3B,IAAIS,EAAU,UAAU,OAAS,GAAK,UAAU,CAAC,IAAM,OAAY,UAAU,CAAC,EAAI,CAChF,UAAW,SAAS,IACtB,EACA,OAAOE,EAAaX,EAAQS,CAAO,CACrC,CAOF,EAAG,CACD,IAAK,MACL,MAAO,SAAaT,EAAQ,CAC1B,OAAOE,EAAYF,CAAM,CAC3B,CAOF,EAAG,CACD,IAAK,cACL,MAAO,UAAuB,CAC5B,IAAIgB,EAAS,UAAU,OAAS,GAAK,UAAU,CAAC,IAAM,OAAY,UAAU,CAAC,EAAI,CAAC,OAAQ,KAAK,EAC3F6C,EAAU,OAAO7C,GAAW,SAAW,CAACA,CAAM,EAAIA,EAClD8C,GAAU,CAAC,CAAC,SAAS,sBACzB,OAAAD,EAAQ,QAAQ,SAAU7C,GAAQ,CAChC8C,GAAUA,IAAW,CAAC,CAAC,SAAS,sBAAsB9C,EAAM,CAC9D,CAAC,EACM8C,EACT,CACF,CAAC,CAAC,EAEKR,CACT,EAAG/D,EAAqB,CAAE,EAEOF,GAAaiE,EAExC,EAEA,IACC,SAASxE,EAAQ,CAExB,IAAIiF,EAAqB,EAKzB,GAAI,OAAO,SAAY,aAAe,CAAC,QAAQ,UAAU,QAAS,CAC9D,IAAIC,EAAQ,QAAQ,UAEpBA,EAAM,QAAUA,EAAM,iBACNA,EAAM,oBACNA,EAAM,mBACNA,EAAM,kBACNA,EAAM,qBAC1B,CASA,SAASC,EAASb,EAASQ,EAAU,CACjC,KAAOR,GAAWA,EAAQ,WAAaW,GAAoB,CACvD,GAAI,OAAOX,EAAQ,SAAY,YAC3BA,EAAQ,QAAQQ,CAAQ,EAC1B,OAAOR,EAETA,EAAUA,EAAQ,UACtB,CACJ,CAEAtE,EAAO,QAAUmF,CAGX,EAEA,IACC,S
AASnF,EAAQoF,EAA0B9E,EAAqB,CAEvE,IAAI6E,EAAU7E,EAAoB,GAAG,EAYrC,SAAS+E,EAAUf,EAASQ,EAAU/D,EAAMuE,EAAUC,EAAY,CAC9D,IAAIC,EAAaC,EAAS,MAAM,KAAM,SAAS,EAE/C,OAAAnB,EAAQ,iBAAiBvD,EAAMyE,EAAYD,CAAU,EAE9C,CACH,QAAS,UAAW,CAChBjB,EAAQ,oBAAoBvD,EAAMyE,EAAYD,CAAU,CAC5D,CACJ,CACJ,CAYA,SAASG,EAASC,EAAUb,EAAU/D,EAAMuE,EAAUC,EAAY,CAE9D,OAAI,OAAOI,EAAS,kBAAqB,WAC9BN,EAAU,MAAM,KAAM,SAAS,EAItC,OAAOtE,GAAS,WAGTsE,EAAU,KAAK,KAAM,QAAQ,EAAE,MAAM,KAAM,SAAS,GAI3D,OAAOM,GAAa,WACpBA,EAAW,SAAS,iBAAiBA,CAAQ,GAI1C,MAAM,UAAU,IAAI,KAAKA,EAAU,SAAUrB,EAAS,CACzD,OAAOe,EAAUf,EAASQ,EAAU/D,EAAMuE,EAAUC,CAAU,CAClE,CAAC,EACL,CAWA,SAASE,EAASnB,EAASQ,EAAU/D,EAAMuE,EAAU,CACjD,OAAO,SAASnB,EAAG,CACfA,EAAE,eAAiBgB,EAAQhB,EAAE,OAAQW,CAAQ,EAEzCX,EAAE,gBACFmB,EAAS,KAAKhB,EAASH,CAAC,CAEhC,CACJ,CAEAnE,EAAO,QAAU0F,CAGX,EAEA,IACC,SAAStF,EAAyBL,EAAS,CAQlDA,EAAQ,KAAO,SAASuB,EAAO,CAC3B,OAAOA,IAAU,QACVA,aAAiB,aACjBA,EAAM,WAAa,CAC9B,EAQAvB,EAAQ,SAAW,SAASuB,EAAO,CAC/B,IAAIP,EAAO,OAAO,UAAU,SAAS,KAAKO,CAAK,EAE/C,OAAOA,IAAU,SACTP,IAAS,qBAAuBA,IAAS,4BACzC,WAAYO,IACZA,EAAM,SAAW,GAAKvB,EAAQ,KAAKuB,EAAM,CAAC,CAAC,EACvD,EAQAvB,EAAQ,OAAS,SAASuB,EAAO,CAC7B,OAAO,OAAOA,GAAU,UACjBA,aAAiB,MAC5B,EAQAvB,EAAQ,GAAK,SAASuB,EAAO,CACzB,IAAIP,EAAO,OAAO,UAAU,SAAS,KAAKO,CAAK,EAE/C,OAAOP,IAAS,mBACpB,CAGM,EAEA,IACC,SAASf,EAAQoF,EAA0B9E,EAAqB,CAEvE,IAAIsF,EAAKtF,EAAoB,GAAG,EAC5BoF,EAAWpF,EAAoB,GAAG,EAWtC,SAASI,EAAOQ,EAAQH,EAAMuE,EAAU,CACpC,GAAI,CAACpE,GAAU,CAACH,GAAQ,CAACuE,EACrB,MAAM,IAAI,MAAM,4BAA4B,EAGhD,GAAI,CAACM,EAAG,OAAO7E,CAAI,EACf,MAAM,IAAI,UAAU,kCAAkC,EAG1D,GAAI,CAAC6E,EAAG,GAAGN,CAAQ,EACf,MAAM,IAAI,UAAU,mCAAmC,EAG3D,GAAIM,EAAG,KAAK1E,CAAM,EACd,OAAO2E,EAAW3E,EAAQH,EAAMuE,CAAQ,EAEvC,GAAIM,EAAG,SAAS1E,CAAM,EACvB,OAAO4E,EAAe5E,EAAQH,EAAMuE,CAAQ,EAE3C,GAAIM,EAAG,OAAO1E,CAAM,EACrB,OAAO6E,EAAe7E,EAAQH,EAAMuE,CAAQ,EAG5C,MAAM,IAAI,UAAU,2EAA2E,CAEvG,CAWA,SAASO,EAAWG,EAAMjF,EAAMuE,EAAU,CACtC,OAAAU,EAAK,iBAAiBjF,EAAMuE,CAAQ,EAE7B,CACH,QAAS,UAAW,CAChBU,EAAK,oBAAoBjF,EAAMuE,CAAQ,CAC3C,CACJ,CACJ,CAWA,SAASQ,EAAeG,EAAUlF,EAAMuE,EAAU,CAC9C,aAAM,UAAU,QAAQ,KA
AKW,EAAU,SAASD,EAAM,CAClDA,EAAK,iBAAiBjF,EAAMuE,CAAQ,CACxC,CAAC,EAEM,CACH,QAAS,UAAW,CAChB,MAAM,UAAU,QAAQ,KAAKW,EAAU,SAASD,EAAM,CAClDA,EAAK,oBAAoBjF,EAAMuE,CAAQ,CAC3C,CAAC,CACL,CACJ,CACJ,CAWA,SAASS,EAAejB,EAAU/D,EAAMuE,EAAU,CAC9C,OAAOI,EAAS,SAAS,KAAMZ,EAAU/D,EAAMuE,CAAQ,CAC3D,CAEAtF,EAAO,QAAUU,CAGX,EAEA,IACC,SAASV,EAAQ,CAExB,SAASkG,EAAO5B,EAAS,CACrB,IAAInD,EAEJ,GAAImD,EAAQ,WAAa,SACrBA,EAAQ,MAAM,EAEdnD,EAAemD,EAAQ,cAElBA,EAAQ,WAAa,SAAWA,EAAQ,WAAa,WAAY,CACtE,IAAI6B,EAAa7B,EAAQ,aAAa,UAAU,EAE3C6B,GACD7B,EAAQ,aAAa,WAAY,EAAE,EAGvCA,EAAQ,OAAO,EACfA,EAAQ,kBAAkB,EAAGA,EAAQ,MAAM,MAAM,EAE5C6B,GACD7B,EAAQ,gBAAgB,UAAU,EAGtCnD,EAAemD,EAAQ,KAC3B,KACK,CACGA,EAAQ,aAAa,iBAAiB,GACtCA,EAAQ,MAAM,EAGlB,IAAI8B,EAAY,OAAO,aAAa,EAChCC,EAAQ,SAAS,YAAY,EAEjCA,EAAM,mBAAmB/B,CAAO,EAChC8B,EAAU,gBAAgB,EAC1BA,EAAU,SAASC,CAAK,EAExBlF,EAAeiF,EAAU,SAAS,CACtC,CAEA,OAAOjF,CACX,CAEAnB,EAAO,QAAUkG,CAGX,EAEA,IACC,SAASlG,EAAQ,CAExB,SAASsG,GAAK,CAGd,CAEAA,EAAE,UAAY,CACZ,GAAI,SAAUC,EAAMjB,EAAUkB,EAAK,CACjC,IAAIrC,EAAI,KAAK,IAAM,KAAK,EAAI,CAAC,GAE7B,OAACA,EAAEoC,CAAI,IAAMpC,EAAEoC,CAAI,EAAI,CAAC,IAAI,KAAK,CAC/B,GAAIjB,EACJ,IAAKkB,CACP,CAAC,EAEM,IACT,EAEA,KAAM,SAAUD,EAAMjB,EAAUkB,EAAK,CACnC,IAAIxC,EAAO,KACX,SAASyB,GAAY,CACnBzB,EAAK,IAAIuC,EAAMd,CAAQ,EACvBH,EAAS,MAAMkB,EAAK,SAAS,CAC/B,CAEA,OAAAf,EAAS,EAAIH,EACN,KAAK,GAAGiB,EAAMd,EAAUe,CAAG,CACpC,EAEA,KAAM,SAAUD,EAAM,CACpB,IAAIE,EAAO,CAAC,EAAE,MAAM,KAAK,UAAW,CAAC,EACjCC,IAAW,KAAK,IAAM,KAAK,EAAI,CAAC,IAAIH,CAAI,GAAK,CAAC,GAAG,MAAM,EACvD3D,EAAI,EACJ+D,EAAMD,EAAO,OAEjB,IAAK9D,EAAGA,EAAI+D,EAAK/D,IACf8D,EAAO9D,CAAC,EAAE,GAAG,MAAM8D,EAAO9D,CAAC,EAAE,IAAK6D,CAAI,EAGxC,OAAO,IACT,EAEA,IAAK,SAAUF,EAAMjB,EAAU,CAC7B,IAAInB,EAAI,KAAK,IAAM,KAAK,EAAI,CAAC,GACzByC,EAAOzC,EAAEoC,CAAI,EACbM,EAAa,CAAC,EAElB,GAAID,GAAQtB,EACV,QAAS1C,EAAI,EAAG+D,EAAMC,EAAK,OAAQhE,EAAI+D,EAAK/D,IACtCgE,EAAKhE,CAAC,EAAE,KAAO0C,GAAYsB,EAAKhE,CAAC,EAAE,GAAG,IAAM0C,GAC9CuB,EAAW,KAAKD,EAAKhE,CAAC,CAAC,EAQ7B,OAACiE,EAAW,OACR1C,EAAEoC,CAAI,EAAIM,EACV,OAAO1C,EAAEoC,CAAI,EAEV,IACT,CACF,EAEAvG,EAAO,QAAUsG,EACjBt
G,EAAO,QAAQ,YAAcsG,CAGvB,CAEI,EAGIQ,EAA2B,CAAC,EAGhC,SAASxG,EAAoByG,EAAU,CAEtC,GAAGD,EAAyBC,CAAQ,EACnC,OAAOD,EAAyBC,CAAQ,EAAE,QAG3C,IAAI/G,EAAS8G,EAAyBC,CAAQ,EAAI,CAGjD,QAAS,CAAC,CACX,EAGA,OAAA5G,EAAoB4G,CAAQ,EAAE/G,EAAQA,EAAO,QAASM,CAAmB,EAGlEN,EAAO,OACf,CAIA,OAAC,UAAW,CAEXM,EAAoB,EAAI,SAASN,EAAQ,CACxC,IAAIgH,EAAShH,GAAUA,EAAO,WAC7B,UAAW,CAAE,OAAOA,EAAO,OAAY,EACvC,UAAW,CAAE,OAAOA,CAAQ,EAC7B,OAAAM,EAAoB,EAAE0G,EAAQ,CAAE,EAAGA,CAAO,CAAC,EACpCA,CACR,CACD,EAAE,EAGD,UAAW,CAEX1G,EAAoB,EAAI,SAASP,EAASkH,EAAY,CACrD,QAAQC,KAAOD,EACX3G,EAAoB,EAAE2G,EAAYC,CAAG,GAAK,CAAC5G,EAAoB,EAAEP,EAASmH,CAAG,GAC/E,OAAO,eAAenH,EAASmH,EAAK,CAAE,WAAY,GAAM,IAAKD,EAAWC,CAAG,CAAE,CAAC,CAGjF,CACD,EAAE,EAGD,UAAW,CACX5G,EAAoB,EAAI,SAASyB,EAAKoF,EAAM,CAAE,OAAO,OAAO,UAAU,eAAe,KAAKpF,EAAKoF,CAAI,CAAG,CACvG,EAAE,EAMK7G,EAAoB,GAAG,CAC/B,EAAG,EACX,OACD,CAAC,ICz3BD,IAAA8G,GAAAC,GAAA,CAAAC,GAAAC,KAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,GAeA,IAAIC,GAAkB,UAOtBD,GAAO,QAAUE,GAUjB,SAASA,GAAWC,EAAQ,CAC1B,IAAIC,EAAM,GAAKD,EACXE,EAAQJ,GAAgB,KAAKG,CAAG,EAEpC,GAAI,CAACC,EACH,OAAOD,EAGT,IAAIE,EACAC,EAAO,GACPC,EAAQ,EACRC,EAAY,EAEhB,IAAKD,EAAQH,EAAM,MAAOG,EAAQJ,EAAI,OAAQI,IAAS,CACrD,OAAQJ,EAAI,WAAWI,CAAK,EAAG,CAC7B,IAAK,IACHF,EAAS,SACT,MACF,IAAK,IACHA,EAAS,QACT,MACF,IAAK,IACHA,EAAS,QACT,MACF,IAAK,IACHA,EAAS,OACT,MACF,IAAK,IACHA,EAAS,OACT,MACF,QACE,QACJ,CAEIG,IAAcD,IAChBD,GAAQH,EAAI,UAAUK,EAAWD,CAAK,GAGxCC,EAAYD,EAAQ,EACpBD,GAAQD,CACV,CAEA,OAAOG,IAAcD,EACjBD,EAAOH,EAAI,UAAUK,EAAWD,CAAK,EACrCD,CACN,ICvDA,IAAAG,GAAO,SCtBP;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,gFAgBA,IAAIC,GAAgB,SAASC,EAAGC,EAAG,CAC/B,OAAAF,GAAgB,OAAO,gBAClB,CAAE,UAAW,CAAC,CAAE,YAAa,OAAS,SAAUC,EAAGC,EAAG,CAAED,EAAE,UAAYC,CAAG,GAC1E,SAAUD,EAAGC,EAAG,CAAE,QAASC,KAAKD,EAAO,OAAO,UAAU,eAAe,KAAKA,EAAGC,CAAC,IAAGF,EAAEE,CAAC,EAAID,EAAEC,CAAC,EAAG,EAC7FH,GAAcC,EAAGC,CAAC,CAC7B,EAEO,SAASE,GAAUH,EAAGC,EAAG,CAC5B,GAAI,OAAOA,GAAM,YAAcA,IAAM,KACjC,MAAM,IAAI,UAAU,uBAAyB,OAAOA,CAAC,EAAI,+BAA+B,EAC5FF,GAAcC,EAAGC,CAAC,EAClB,SAASG,GAAK,CAAE,KAAK,YAAcJ
,CAAG,CACtCA,EAAE,UAAYC,IAAM,KAAO,OAAO,OAAOA,CAAC,GAAKG,EAAG,UAAYH,EAAE,UAAW,IAAIG,EACnF,CAwCO,SAASC,GAAUC,EAASC,EAAYC,EAAGC,EAAW,CACzD,SAASC,EAAMC,EAAO,CAAE,OAAOA,aAAiBH,EAAIG,EAAQ,IAAIH,EAAE,SAAUI,EAAS,CAAEA,EAAQD,CAAK,CAAG,CAAC,CAAG,CAC3G,OAAO,IAAKH,IAAMA,EAAI,UAAU,SAAUI,EAASC,EAAQ,CACvD,SAASC,EAAUH,EAAO,CAAE,GAAI,CAAEI,EAAKN,EAAU,KAAKE,CAAK,CAAC,CAAG,OAASK,EAAG,CAAEH,EAAOG,CAAC,CAAG,CAAE,CAC1F,SAASC,EAASN,EAAO,CAAE,GAAI,CAAEI,EAAKN,EAAU,MAASE,CAAK,CAAC,CAAG,OAASK,EAAG,CAAEH,EAAOG,CAAC,CAAG,CAAE,CAC7F,SAASD,EAAKG,EAAQ,CAAEA,EAAO,KAAON,EAAQM,EAAO,KAAK,EAAIR,EAAMQ,EAAO,KAAK,EAAE,KAAKJ,EAAWG,CAAQ,CAAG,CAC7GF,GAAMN,EAAYA,EAAU,MAAMH,EAASC,GAAc,CAAC,CAAC,GAAG,KAAK,CAAC,CACxE,CAAC,CACL,CAEO,SAASY,GAAYb,EAASc,EAAM,CACvC,IAAIC,EAAI,CAAE,MAAO,EAAG,KAAM,UAAW,CAAE,GAAIC,EAAE,CAAC,EAAI,EAAG,MAAMA,EAAE,CAAC,EAAG,OAAOA,EAAE,CAAC,CAAG,EAAG,KAAM,CAAC,EAAG,IAAK,CAAC,CAAE,EAAGC,EAAGC,EAAGF,EAAGG,EAC/G,OAAOA,EAAI,CAAE,KAAMC,EAAK,CAAC,EAAG,MAASA,EAAK,CAAC,EAAG,OAAUA,EAAK,CAAC,CAAE,EAAG,OAAO,QAAW,aAAeD,EAAE,OAAO,QAAQ,EAAI,UAAW,CAAE,OAAO,IAAM,GAAIA,EACvJ,SAASC,EAAKC,EAAG,CAAE,OAAO,SAAUC,EAAG,CAAE,OAAOb,EAAK,CAACY,EAAGC,CAAC,CAAC,CAAG,CAAG,CACjE,SAASb,EAAKc,EAAI,CACd,GAAIN,EAAG,MAAM,IAAI,UAAU,iCAAiC,EAC5D,KAAOF,GAAG,GAAI,CACV,GAAIE,EAAI,EAAGC,IAAMF,EAAIO,EAAG,CAAC,EAAI,EAAIL,EAAE,OAAYK,EAAG,CAAC,EAAIL,EAAE,SAAcF,EAAIE,EAAE,SAAcF,EAAE,KAAKE,CAAC,EAAG,GAAKA,EAAE,OAAS,EAAEF,EAAIA,EAAE,KAAKE,EAAGK,EAAG,CAAC,CAAC,GAAG,KAAM,OAAOP,EAE3J,OADIE,EAAI,EAAGF,IAAGO,EAAK,CAACA,EAAG,CAAC,EAAI,EAAGP,EAAE,KAAK,GAC9BO,EAAG,CAAC,EAAG,CACX,IAAK,GAAG,IAAK,GAAGP,EAAIO,EAAI,MACxB,IAAK,GAAG,OAAAR,EAAE,QAAgB,CAAE,MAAOQ,EAAG,CAAC,EAAG,KAAM,EAAM,EACtD,IAAK,GAAGR,EAAE,QAASG,EAAIK,EAAG,CAAC,EAAGA,EAAK,CAAC,CAAC,EAAG,SACxC,IAAK,GAAGA,EAAKR,EAAE,IAAI,IAAI,EAAGA,EAAE,KAAK,IAAI,EAAG,SACxC,QACI,GAAMC,EAAID,EAAE,KAAM,EAAAC,EAAIA,EAAE,OAAS,GAAKA,EAAEA,EAAE,OAAS,CAAC,KAAOO,EAAG,CAAC,IAAM,GAAKA,EAAG,CAAC,IAAM,GAAI,CAAER,EAAI,EAAG,QAAU,CAC3G,GAAIQ,EAAG,CAAC,IAAM,IAAM,CAACP,GAAMO,EAAG,CAAC,EAAIP,EAAE,CAAC,GAAKO,EAAG,CAAC,EAAI
P,EAAE,CAAC,GAAK,CAAED,EAAE,MAAQQ,EAAG,CAAC,EAAG,KAAO,CACrF,GAAIA,EAAG,CAAC,IAAM,GAAKR,EAAE,MAAQC,EAAE,CAAC,EAAG,CAAED,EAAE,MAAQC,EAAE,CAAC,EAAGA,EAAIO,EAAI,KAAO,CACpE,GAAIP,GAAKD,EAAE,MAAQC,EAAE,CAAC,EAAG,CAAED,EAAE,MAAQC,EAAE,CAAC,EAAGD,EAAE,IAAI,KAAKQ,CAAE,EAAG,KAAO,CAC9DP,EAAE,CAAC,GAAGD,EAAE,IAAI,IAAI,EACpBA,EAAE,KAAK,IAAI,EAAG,QACtB,CACAQ,EAAKT,EAAK,KAAKd,EAASe,CAAC,CAC7B,OAASL,EAAG,CAAEa,EAAK,CAAC,EAAGb,CAAC,EAAGQ,EAAI,CAAG,QAAE,CAAUD,EAAID,EAAI,CAAG,CACzD,GAAIO,EAAG,CAAC,EAAI,EAAG,MAAMA,EAAG,CAAC,EAAG,MAAO,CAAE,MAAOA,EAAG,CAAC,EAAIA,EAAG,CAAC,EAAI,OAAQ,KAAM,EAAK,CACnF,CACJ,CAcO,SAASC,GAASC,EAAG,CACxB,IAAIC,EAAI,OAAO,QAAW,YAAc,OAAO,SAAUC,EAAID,GAAKD,EAAEC,CAAC,EAAGE,EAAI,EAC5E,GAAID,EAAG,OAAOA,EAAE,KAAKF,CAAC,EACtB,GAAIA,GAAK,OAAOA,EAAE,QAAW,SAAU,MAAO,CAC1C,KAAM,UAAY,CACd,OAAIA,GAAKG,GAAKH,EAAE,SAAQA,EAAI,QACrB,CAAE,MAAOA,GAAKA,EAAEG,GAAG,EAAG,KAAM,CAACH,CAAE,CAC1C,CACJ,EACA,MAAM,IAAI,UAAUC,EAAI,0BAA4B,iCAAiC,CACzF,CAEO,SAASG,EAAOJ,EAAGK,EAAG,CACzB,IAAIH,EAAI,OAAO,QAAW,YAAcF,EAAE,OAAO,QAAQ,EACzD,GAAI,CAACE,EAAG,OAAOF,EACf,IAAIG,EAAID,EAAE,KAAKF,CAAC,EAAGM,EAAGC,EAAK,CAAC,EAAGC,EAC/B,GAAI,CACA,MAAQH,IAAM,QAAUA,KAAM,IAAM,EAAEC,EAAIH,EAAE,KAAK,GAAG,MAAMI,EAAG,KAAKD,EAAE,KAAK,CAC7E,OACOG,EAAO,CAAED,EAAI,CAAE,MAAOC,CAAM,CAAG,QACtC,CACI,GAAI,CACIH,GAAK,CAACA,EAAE,OAASJ,EAAIC,EAAE,SAAYD,EAAE,KAAKC,CAAC,CACnD,QACA,CAAU,GAAIK,EAAG,MAAMA,EAAE,KAAO,CACpC,CACA,OAAOD,CACX,CAkBO,SAASG,EAAcC,EAAIC,EAAMC,EAAM,CAC1C,GAAIA,GAAQ,UAAU,SAAW,EAAG,QAASC,EAAI,EAAGC,EAAIH,EAAK,OAAQI,EAAIF,EAAIC,EAAGD,KACxEE,GAAM,EAAEF,KAAKF,MACRI,IAAIA,EAAK,MAAM,UAAU,MAAM,KAAKJ,EAAM,EAAGE,CAAC,GACnDE,EAAGF,CAAC,EAAIF,EAAKE,CAAC,GAGtB,OAAOH,EAAG,OAAOK,GAAM,MAAM,UAAU,MAAM,KAAKJ,CAAI,CAAC,CAC3D,CAEO,SAASK,GAAQC,EAAG,CACvB,OAAO,gBAAgBD,IAAW,KAAK,EAAIC,EAAG,MAAQ,IAAID,GAAQC,CAAC,CACvE,CAEO,SAASC,GAAiBC,EAASC,EAAYC,EAAW,CAC7D,GAAI,CAAC,OAAO,cAAe,MAAM,IAAI,UAAU,sCAAsC,EACrF,IAAIC,EAAID,EAAU,MAAMF,EAASC,GAAc,CAAC,CAAC,EAAGP,EAAGU,EAAI,CAAC,EAC5D,OAAOV,EAAI,CAAC,EAAGW,EAAK,MAAM,EAAGA,EAAK,OAAO,EAAGA,EAAK,QAAQ
,EAAGX,EAAE,OAAO,aAAa,EAAI,UAAY,CAAE,OAAO,IAAM,EAAGA,EACpH,SAASW,EAAKC,EAAG,CAAMH,EAAEG,CAAC,IAAGZ,EAAEY,CAAC,EAAI,SAAUR,EAAG,CAAE,OAAO,IAAI,QAAQ,SAAUS,EAAGC,EAAG,CAAEJ,EAAE,KAAK,CAACE,EAAGR,EAAGS,EAAGC,CAAC,CAAC,EAAI,GAAKC,EAAOH,EAAGR,CAAC,CAAG,CAAC,CAAG,EAAG,CACzI,SAASW,EAAOH,EAAGR,EAAG,CAAE,GAAI,CAAEY,EAAKP,EAAEG,CAAC,EAAER,CAAC,CAAC,CAAG,OAASa,EAAG,CAAEC,EAAOR,EAAE,CAAC,EAAE,CAAC,EAAGO,CAAC,CAAG,CAAE,CACjF,SAASD,EAAKG,EAAG,CAAEA,EAAE,iBAAiBhB,GAAU,QAAQ,QAAQgB,EAAE,MAAM,CAAC,EAAE,KAAKC,EAASC,CAAM,EAAIH,EAAOR,EAAE,CAAC,EAAE,CAAC,EAAGS,CAAC,CAAG,CACvH,SAASC,EAAQE,EAAO,CAAEP,EAAO,OAAQO,CAAK,CAAG,CACjD,SAASD,EAAOC,EAAO,CAAEP,EAAO,QAASO,CAAK,CAAG,CACjD,SAASJ,EAAOK,EAAGnB,EAAG,CAAMmB,EAAEnB,CAAC,EAAGM,EAAE,MAAM,EAAGA,EAAE,QAAQK,EAAOL,EAAE,CAAC,EAAE,CAAC,EAAGA,EAAE,CAAC,EAAE,CAAC,CAAC,CAAG,CACrF,CAQO,SAASc,GAAcC,EAAG,CAC7B,GAAI,CAAC,OAAO,cAAe,MAAM,IAAI,UAAU,sCAAsC,EACrF,IAAIC,EAAID,EAAE,OAAO,aAAa,EAAGE,EACjC,OAAOD,EAAIA,EAAE,KAAKD,CAAC,GAAKA,EAAI,OAAOG,IAAa,WAAaA,GAASH,CAAC,EAAIA,EAAE,OAAO,QAAQ,EAAE,EAAGE,EAAI,CAAC,EAAGE,EAAK,MAAM,EAAGA,EAAK,OAAO,EAAGA,EAAK,QAAQ,EAAGF,EAAE,OAAO,aAAa,EAAI,UAAY,CAAE,OAAO,IAAM,EAAGA,GAC9M,SAASE,EAAKC,EAAG,CAAEH,EAAEG,CAAC,EAAIL,EAAEK,CAAC,GAAK,SAAUC,EAAG,CAAE,OAAO,IAAI,QAAQ,SAAUC,EAASC,EAAQ,CAAEF,EAAIN,EAAEK,CAAC,EAAEC,CAAC,EAAGG,EAAOF,EAASC,EAAQF,EAAE,KAAMA,EAAE,KAAK,CAAG,CAAC,CAAG,CAAG,CAC/J,SAASG,EAAOF,EAASC,EAAQE,EAAGJ,EAAG,CAAE,QAAQ,QAAQA,CAAC,EAAE,KAAK,SAASA,EAAG,CAAEC,EAAQ,CAAE,MAAOD,EAAG,KAAMI,CAAE,CAAC,CAAG,EAAGF,CAAM,CAAG,CAC/H,CCtMM,SAAUG,EAAWC,EAAU,CACnC,OAAO,OAAOA,GAAU,UAC1B,CCGM,SAAUC,GAAoBC,EAAgC,CAClE,IAAMC,EAAS,SAACC,EAAa,CAC3B,MAAM,KAAKA,CAAQ,EACnBA,EAAS,MAAQ,IAAI,MAAK,EAAG,KAC/B,EAEMC,EAAWH,EAAWC,CAAM,EAClC,OAAAE,EAAS,UAAY,OAAO,OAAO,MAAM,SAAS,EAClDA,EAAS,UAAU,YAAcA,EAC1BA,CACT,CCDO,IAAMC,GAA+CC,GAC1D,SAACC,EAAM,CACL,OAAA,SAA4CC,EAA0B,CACpED,EAAO,IAAI,EACX,KAAK,QAAUC,EACRA,EAAO,OAAM;EACxBA,EAAO,IAAI,SAACC,EAAKC,EAAC,CAAK,OAAGA,EAAI,EAAC,KAAKD,EAAI,SAAQ,CAAzB,CAA6B,EAAE,KAAK;GAAM,EACzD,GACJ,KAAK,KAAO,sBACZ,KAAK,OAASD,CAChB
,CARA,CAQC,ECvBC,SAAUG,GAAaC,EAA6BC,EAAO,CAC/D,GAAID,EAAK,CACP,IAAME,EAAQF,EAAI,QAAQC,CAAI,EAC9B,GAAKC,GAASF,EAAI,OAAOE,EAAO,CAAC,EAErC,CCOA,IAAAC,GAAA,UAAA,CAyBE,SAAAA,EAAoBC,EAA4B,CAA5B,KAAA,gBAAAA,EAdb,KAAA,OAAS,GAER,KAAA,WAAmD,KAMnD,KAAA,YAAqD,IAMV,CAQnD,OAAAD,EAAA,UAAA,YAAA,UAAA,aACME,EAEJ,GAAI,CAAC,KAAK,OAAQ,CAChB,KAAK,OAAS,GAGN,IAAAC,EAAe,KAAI,WAC3B,GAAIA,EAEF,GADA,KAAK,WAAa,KACd,MAAM,QAAQA,CAAU,MAC1B,QAAqBC,EAAAC,GAAAF,CAAU,EAAAG,EAAAF,EAAA,KAAA,EAAA,CAAAE,EAAA,KAAAA,EAAAF,EAAA,KAAA,EAAE,CAA5B,IAAMG,EAAMD,EAAA,MACfC,EAAO,OAAO,IAAI,yGAGpBJ,EAAW,OAAO,IAAI,EAIlB,IAAiBK,EAAqB,KAAI,gBAClD,GAAIC,EAAWD,CAAgB,EAC7B,GAAI,CACFA,EAAgB,QACTE,EAAG,CACVR,EAASQ,aAAaC,GAAsBD,EAAE,OAAS,CAACA,CAAC,EAIrD,IAAAE,EAAgB,KAAI,YAC5B,GAAIA,EAAa,CACf,KAAK,YAAc,SACnB,QAAwBC,EAAAR,GAAAO,CAAW,EAAAE,EAAAD,EAAA,KAAA,EAAA,CAAAC,EAAA,KAAAA,EAAAD,EAAA,KAAA,EAAE,CAAhC,IAAME,EAASD,EAAA,MAClB,GAAI,CACFE,GAAcD,CAAS,QAChBE,EAAK,CACZf,EAASA,GAAM,KAANA,EAAU,CAAA,EACfe,aAAeN,GACjBT,EAAMgB,EAAAA,EAAA,CAAA,EAAAC,EAAOjB,CAAM,CAAA,EAAAiB,EAAKF,EAAI,MAAM,CAAA,EAElCf,EAAO,KAAKe,CAAG,sGAMvB,GAAIf,EACF,MAAM,IAAIS,GAAoBT,CAAM,EAG1C,EAoBAF,EAAA,UAAA,IAAA,SAAIoB,EAAuB,OAGzB,GAAIA,GAAYA,IAAa,KAC3B,GAAI,KAAK,OAGPJ,GAAcI,CAAQ,MACjB,CACL,GAAIA,aAAoBpB,EAAc,CAGpC,GAAIoB,EAAS,QAAUA,EAAS,WAAW,IAAI,EAC7C,OAEFA,EAAS,WAAW,IAAI,GAEzB,KAAK,aAAcC,EAAA,KAAK,eAAW,MAAAA,IAAA,OAAAA,EAAI,CAAA,GAAI,KAAKD,CAAQ,EAG/D,EAOQpB,EAAA,UAAA,WAAR,SAAmBsB,EAAoB,CAC7B,IAAAnB,EAAe,KAAI,WAC3B,OAAOA,IAAemB,GAAW,MAAM,QAAQnB,CAAU,GAAKA,EAAW,SAASmB,CAAM,CAC1F,EASQtB,EAAA,UAAA,WAAR,SAAmBsB,EAAoB,CAC7B,IAAAnB,EAAe,KAAI,WAC3B,KAAK,WAAa,MAAM,QAAQA,CAAU,GAAKA,EAAW,KAAKmB,CAAM,EAAGnB,GAAcA,EAAa,CAACA,EAAYmB,CAAM,EAAIA,CAC5H,EAMQtB,EAAA,UAAA,cAAR,SAAsBsB,EAAoB,CAChC,IAAAnB,EAAe,KAAI,WACvBA,IAAemB,EACjB,KAAK,WAAa,KACT,MAAM,QAAQnB,CAAU,GACjCoB,GAAUpB,EAAYmB,CAAM,CAEhC,EAgBAtB,EAAA,UAAA,OAAA,SAAOoB,EAAsC,CACnC,IAAAR,EAAgB,KAAI,YAC5BA,GAAeW,GAAUX,EAAaQ,CAAQ,EAE1CA,aAAoBpB,GACtBoB,EAAS,cAAc,IAAI,CAE/B,EAlLcpB,EAAA,MAAS,UAAA,CACrB,IAAMwB,EAAQ,IAAIxB,EAClB,OAAA
wB,EAAM,OAAS,GACRA,CACT,EAAE,EA+KJxB,GArLA,EAuLO,IAAMyB,GAAqBC,GAAa,MAEzC,SAAUC,GAAeC,EAAU,CACvC,OACEA,aAAiBF,IAChBE,GAAS,WAAYA,GAASC,EAAWD,EAAM,MAAM,GAAKC,EAAWD,EAAM,GAAG,GAAKC,EAAWD,EAAM,WAAW,CAEpH,CAEA,SAASE,GAAcC,EAAwC,CACzDF,EAAWE,CAAS,EACtBA,EAAS,EAETA,EAAU,YAAW,CAEzB,CChNO,IAAMC,GAAuB,CAClC,iBAAkB,KAClB,sBAAuB,KACvB,QAAS,OACT,sCAAuC,GACvC,yBAA0B,ICGrB,IAAMC,GAAmC,CAG9C,WAAA,SAAWC,EAAqBC,EAAgB,SAAEC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,EAAA,CAAA,EAAA,UAAAA,CAAA,EACxC,IAAAC,EAAaL,GAAe,SACpC,OAAIK,GAAQ,MAARA,EAAU,WACLA,EAAS,WAAU,MAAnBA,EAAQC,EAAA,CAAYL,EAASC,CAAO,EAAAK,EAAKJ,CAAI,CAAA,CAAA,EAE/C,WAAU,MAAA,OAAAG,EAAA,CAACL,EAASC,CAAO,EAAAK,EAAKJ,CAAI,CAAA,CAAA,CAC7C,EACA,aAAA,SAAaK,EAAM,CACT,IAAAH,EAAaL,GAAe,SACpC,QAAQK,GAAQ,KAAA,OAARA,EAAU,eAAgB,cAAcG,CAAa,CAC/D,EACA,SAAU,QCjBN,SAAUC,GAAqBC,EAAQ,CAC3CC,GAAgB,WAAW,UAAA,CACjB,IAAAC,EAAqBC,GAAM,iBACnC,GAAID,EAEFA,EAAiBF,CAAG,MAGpB,OAAMA,CAEV,CAAC,CACH,CCtBM,SAAUI,IAAI,CAAK,CCMlB,IAAMC,GAAyB,UAAA,CAAM,OAAAC,GAAmB,IAAK,OAAW,MAAS,CAA5C,EAAsE,EAO5G,SAAUC,GAAkBC,EAAU,CAC1C,OAAOF,GAAmB,IAAK,OAAWE,CAAK,CACjD,CAOM,SAAUC,GAAoBC,EAAQ,CAC1C,OAAOJ,GAAmB,IAAKI,EAAO,MAAS,CACjD,CAQM,SAAUJ,GAAmBK,EAAuBD,EAAYF,EAAU,CAC9E,MAAO,CACL,KAAIG,EACJ,MAAKD,EACL,MAAKF,EAET,CCrCA,IAAII,GAAuD,KASrD,SAAUC,GAAaC,EAAc,CACzC,GAAIC,GAAO,sCAAuC,CAChD,IAAMC,EAAS,CAACJ,GAKhB,GAJII,IACFJ,GAAU,CAAE,YAAa,GAAO,MAAO,IAAI,GAE7CE,EAAE,EACEE,EAAQ,CACJ,IAAAC,EAAyBL,GAAvBM,EAAWD,EAAA,YAAEE,EAAKF,EAAA,MAE1B,GADAL,GAAU,KACNM,EACF,MAAMC,QAMVL,EAAE,CAEN,CAMM,SAAUM,GAAaC,EAAQ,CAC/BN,GAAO,uCAAyCH,KAClDA,GAAQ,YAAc,GACtBA,GAAQ,MAAQS,EAEpB,CCrBA,IAAAC,GAAA,SAAAC,EAAA,CAAmCC,GAAAF,EAAAC,CAAA,EA6BjC,SAAAD,EAAYG,EAA6C,CAAzD,IAAAC,EACEH,EAAA,KAAA,IAAA,GAAO,KATC,OAAAG,EAAA,UAAqB,GAUzBD,GACFC,EAAK,YAAcD,EAGfE,GAAeF,CAAW,GAC5BA,EAAY,IAAIC,CAAI,GAGtBA,EAAK,YAAcE,IAEvB,CAzBO,OAAAN,EAAA,OAAP,SAAiBO,EAAwBC,EAA2BC,EAAqB,CACvF,OAAO,IAAIC,GAAeH,EAAMC,EAAOC,CAAQ,CACjD,EAgCAT,EAAA,UAAA,KAAA,SAAKW,EAAS,CACR,KAAK,UACPC,GAA0BC,GAAiBF,CAAK,EAAG,IAAI,EAEvD,KAAK,MAAMA,CAAM,
CAErB,EASAX,EAAA,UAAA,MAAA,SAAMc,EAAS,CACT,KAAK,UACPF,GAA0BG,GAAkBD,CAAG,EAAG,IAAI,GAEtD,KAAK,UAAY,GACjB,KAAK,OAAOA,CAAG,EAEnB,EAQAd,EAAA,UAAA,SAAA,UAAA,CACM,KAAK,UACPY,GAA0BI,GAAuB,IAAI,GAErD,KAAK,UAAY,GACjB,KAAK,UAAS,EAElB,EAEAhB,EAAA,UAAA,YAAA,UAAA,CACO,KAAK,SACR,KAAK,UAAY,GACjBC,EAAA,UAAM,YAAW,KAAA,IAAA,EACjB,KAAK,YAAc,KAEvB,EAEUD,EAAA,UAAA,MAAV,SAAgBW,EAAQ,CACtB,KAAK,YAAY,KAAKA,CAAK,CAC7B,EAEUX,EAAA,UAAA,OAAV,SAAiBc,EAAQ,CACvB,GAAI,CACF,KAAK,YAAY,MAAMA,CAAG,UAE1B,KAAK,YAAW,EAEpB,EAEUd,EAAA,UAAA,UAAV,UAAA,CACE,GAAI,CACF,KAAK,YAAY,SAAQ,UAEzB,KAAK,YAAW,EAEpB,EACFA,CAAA,EApHmCiB,EAAY,EA2H/C,IAAMC,GAAQ,SAAS,UAAU,KAEjC,SAASC,GAAyCC,EAAQC,EAAY,CACpE,OAAOH,GAAM,KAAKE,EAAIC,CAAO,CAC/B,CAMA,IAAAC,GAAA,UAAA,CACE,SAAAA,EAAoBC,EAAqC,CAArC,KAAA,gBAAAA,CAAwC,CAE5D,OAAAD,EAAA,UAAA,KAAA,SAAKE,EAAQ,CACH,IAAAD,EAAoB,KAAI,gBAChC,GAAIA,EAAgB,KAClB,GAAI,CACFA,EAAgB,KAAKC,CAAK,QACnBC,EAAO,CACdC,GAAqBD,CAAK,EAGhC,EAEAH,EAAA,UAAA,MAAA,SAAMK,EAAQ,CACJ,IAAAJ,EAAoB,KAAI,gBAChC,GAAIA,EAAgB,MAClB,GAAI,CACFA,EAAgB,MAAMI,CAAG,QAClBF,EAAO,CACdC,GAAqBD,CAAK,OAG5BC,GAAqBC,CAAG,CAE5B,EAEAL,EAAA,UAAA,SAAA,UAAA,CACU,IAAAC,EAAoB,KAAI,gBAChC,GAAIA,EAAgB,SAClB,GAAI,CACFA,EAAgB,SAAQ,QACjBE,EAAO,CACdC,GAAqBD,CAAK,EAGhC,EACFH,CAAA,EArCA,EAuCAM,GAAA,SAAAC,EAAA,CAAuCC,GAAAF,EAAAC,CAAA,EACrC,SAAAD,EACEG,EACAN,EACAO,EAA8B,CAHhC,IAAAC,EAKEJ,EAAA,KAAA,IAAA,GAAO,KAEHN,EACJ,GAAIW,EAAWH,CAAc,GAAK,CAACA,EAGjCR,EAAkB,CAChB,KAAOQ,GAAc,KAAdA,EAAkB,OACzB,MAAON,GAAK,KAALA,EAAS,OAChB,SAAUO,GAAQ,KAARA,EAAY,YAEnB,CAEL,IAAIG,EACAF,GAAQG,GAAO,0BAIjBD,EAAU,OAAO,OAAOJ,CAAc,EACtCI,EAAQ,YAAc,UAAA,CAAM,OAAAF,EAAK,YAAW,CAAhB,EAC5BV,EAAkB,CAChB,KAAMQ,EAAe,MAAQZ,GAAKY,EAAe,KAAMI,CAAO,EAC9D,MAAOJ,EAAe,OAASZ,GAAKY,EAAe,MAAOI,CAAO,EACjE,SAAUJ,EAAe,UAAYZ,GAAKY,EAAe,SAAUI,CAAO,IAI5EZ,EAAkBQ,EAMtB,OAAAE,EAAK,YAAc,IAAIX,GAAiBC,CAAe,GACzD,CACF,OAAAK,CAAA,EAzCuCS,EAAU,EA2CjD,SAASC,GAAqBC,EAAU,CAClCC,GAAO,sCACTC,GAAaF,CAAK,EAIlBG,GAAqBH,CAAK,CAE9B,CAQA,SAASI,GAAoBC,EAAQ,CACnC,MAAMA,CACR,CAOA,SAASC,GAA0BC,EAA2CC,EAA2B,CAC/F,IAAAC,EAA0BR,GAAM,
sBACxCQ,GAAyBC,GAAgB,WAAW,UAAA,CAAM,OAAAD,EAAsBF,EAAcC,CAAU,CAA9C,CAA+C,CAC3G,CAOO,IAAMG,GAA6D,CACxE,OAAQ,GACR,KAAMC,GACN,MAAOR,GACP,SAAUQ,IC5QL,IAAMC,GAA+B,UAAA,CAAM,OAAC,OAAO,QAAW,YAAc,OAAO,YAAe,cAAvD,EAAsE,ECoClH,SAAUC,GAAYC,EAAI,CAC9B,OAAOA,CACT,CCiCM,SAAUC,IAAI,SAACC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACnB,OAAOC,GAAcF,CAAG,CAC1B,CAGM,SAAUE,GAAoBF,EAA+B,CACjE,OAAIA,EAAI,SAAW,EACVG,GAGLH,EAAI,SAAW,EACVA,EAAI,CAAC,EAGP,SAAeI,EAAQ,CAC5B,OAAOJ,EAAI,OAAO,SAACK,EAAWC,EAAuB,CAAK,OAAAA,EAAGD,CAAI,CAAP,EAAUD,CAAY,CAClF,CACF,CC9EA,IAAAG,EAAA,UAAA,CAkBE,SAAAA,EAAYC,EAA6E,CACnFA,IACF,KAAK,WAAaA,EAEtB,CA4BA,OAAAD,EAAA,UAAA,KAAA,SAAQE,EAAyB,CAC/B,IAAMC,EAAa,IAAIH,EACvB,OAAAG,EAAW,OAAS,KACpBA,EAAW,SAAWD,EACfC,CACT,EA6IAH,EAAA,UAAA,UAAA,SACEI,EACAC,EACAC,EAA8B,CAHhC,IAAAC,EAAA,KAKQC,EAAaC,GAAaL,CAAc,EAAIA,EAAiB,IAAIM,GAAeN,EAAgBC,EAAOC,CAAQ,EAErH,OAAAK,GAAa,UAAA,CACL,IAAAC,EAAuBL,EAArBL,EAAQU,EAAA,SAAEC,EAAMD,EAAA,OACxBJ,EAAW,IACTN,EAGIA,EAAS,KAAKM,EAAYK,CAAM,EAChCA,EAIAN,EAAK,WAAWC,CAAU,EAG1BD,EAAK,cAAcC,CAAU,CAAC,CAEtC,CAAC,EAEMA,CACT,EAGUR,EAAA,UAAA,cAAV,SAAwBc,EAAmB,CACzC,GAAI,CACF,OAAO,KAAK,WAAWA,CAAI,QACpBC,EAAK,CAIZD,EAAK,MAAMC,CAAG,EAElB,EA6DAf,EAAA,UAAA,QAAA,SAAQgB,EAA0BC,EAAoC,CAAtE,IAAAV,EAAA,KACE,OAAAU,EAAcC,GAAeD,CAAW,EAEjC,IAAIA,EAAkB,SAACE,EAASC,EAAM,CAC3C,IAAMZ,EAAa,IAAIE,GAAkB,CACvC,KAAM,SAACW,EAAK,CACV,GAAI,CACFL,EAAKK,CAAK,QACHN,EAAK,CACZK,EAAOL,CAAG,EACVP,EAAW,YAAW,EAE1B,EACA,MAAOY,EACP,SAAUD,EACX,EACDZ,EAAK,UAAUC,CAAU,CAC3B,CAAC,CACH,EAGUR,EAAA,UAAA,WAAV,SAAqBQ,EAA2B,OAC9C,OAAOI,EAAA,KAAK,UAAM,MAAAA,IAAA,OAAA,OAAAA,EAAE,UAAUJ,CAAU,CAC1C,EAOAR,EAAA,UAACG,EAAiB,EAAlB,UAAA,CACE,OAAO,IACT,EA4FAH,EAAA,UAAA,KAAA,UAAA,SAAKsB,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACH,OAAOC,GAAcF,CAAU,EAAE,IAAI,CACvC,EA6BAtB,EAAA,UAAA,UAAA,SAAUiB,EAAoC,CAA9C,IAAAV,EAAA,KACE,OAAAU,EAAcC,GAAeD,CAAW,EAEjC,IAAIA,EAAY,SAACE,EAASC,EAAM,CACrC,IAAIC,EACJd,EAAK,UACH,SAACkB,EAAI,CAAK,OAACJ,EAAQI,CAAT,EACV,SAACV,E
AAQ,CAAK,OAAAK,EAAOL,CAAG,CAAV,EACd,UAAA,CAAM,OAAAI,EAAQE,CAAK,CAAb,CAAc,CAExB,CAAC,CACH,EA1aOrB,EAAA,OAAkC,SAAIC,EAAwD,CACnG,OAAO,IAAID,EAAcC,CAAS,CACpC,EAyaFD,GA9cA,EAudA,SAAS0B,GAAeC,EAA+C,OACrE,OAAOC,EAAAD,GAAW,KAAXA,EAAeE,GAAO,WAAO,MAAAD,IAAA,OAAAA,EAAI,OAC1C,CAEA,SAASE,GAAcC,EAAU,CAC/B,OAAOA,GAASC,EAAWD,EAAM,IAAI,GAAKC,EAAWD,EAAM,KAAK,GAAKC,EAAWD,EAAM,QAAQ,CAChG,CAEA,SAASE,GAAgBF,EAAU,CACjC,OAAQA,GAASA,aAAiBG,IAAgBJ,GAAWC,CAAK,GAAKI,GAAeJ,CAAK,CAC7F,CCzeM,SAAUK,GAAQC,EAAW,CACjC,OAAOC,EAAWD,GAAM,KAAA,OAANA,EAAQ,IAAI,CAChC,CAMM,SAAUE,EACdC,EAAqF,CAErF,OAAO,SAACH,EAAqB,CAC3B,GAAID,GAAQC,CAAM,EAChB,OAAOA,EAAO,KAAK,SAA+BI,EAA2B,CAC3E,GAAI,CACF,OAAOD,EAAKC,EAAc,IAAI,QACvBC,EAAK,CACZ,KAAK,MAAMA,CAAG,EAElB,CAAC,EAEH,MAAM,IAAI,UAAU,wCAAwC,CAC9D,CACF,CCjBM,SAAUC,EACdC,EACAC,EACAC,EACAC,EACAC,EAAuB,CAEvB,OAAO,IAAIC,GAAmBL,EAAaC,EAAQC,EAAYC,EAASC,CAAU,CACpF,CAMA,IAAAC,GAAA,SAAAC,EAAA,CAA2CC,GAAAF,EAAAC,CAAA,EAiBzC,SAAAD,EACEL,EACAC,EACAC,EACAC,EACQC,EACAI,EAAiC,CAN3C,IAAAC,EAoBEH,EAAA,KAAA,KAAMN,CAAW,GAAC,KAfV,OAAAS,EAAA,WAAAL,EACAK,EAAA,kBAAAD,EAeRC,EAAK,MAAQR,EACT,SAAuCS,EAAQ,CAC7C,GAAI,CACFT,EAAOS,CAAK,QACLC,EAAK,CACZX,EAAY,MAAMW,CAAG,EAEzB,EACAL,EAAA,UAAM,MACVG,EAAK,OAASN,EACV,SAAuCQ,EAAQ,CAC7C,GAAI,CACFR,EAAQQ,CAAG,QACJA,EAAK,CAEZX,EAAY,MAAMW,CAAG,UAGrB,KAAK,YAAW,EAEpB,EACAL,EAAA,UAAM,OACVG,EAAK,UAAYP,EACb,UAAA,CACE,GAAI,CACFA,EAAU,QACHS,EAAK,CAEZX,EAAY,MAAMW,CAAG,UAGrB,KAAK,YAAW,EAEpB,EACAL,EAAA,UAAM,WACZ,CAEA,OAAAD,EAAA,UAAA,YAAA,UAAA,OACE,GAAI,CAAC,KAAK,mBAAqB,KAAK,kBAAiB,EAAI,CAC/C,IAAAO,EAAW,KAAI,OACvBN,EAAA,UAAM,YAAW,KAAA,IAAA,EAEjB,CAACM,KAAUC,EAAA,KAAK,cAAU,MAAAA,IAAA,QAAAA,EAAA,KAAf,IAAI,GAEnB,EACFR,CAAA,EAnF2CS,EAAU,ECd9C,IAAMC,GAAiD,CAG5D,SAAA,SAASC,EAAQ,CACf,IAAIC,EAAU,sBACVC,EAAkD,qBAC9CC,EAAaJ,GAAsB,SACvCI,IACFF,EAAUE,EAAS,sBACnBD,EAASC,EAAS,sBAEpB,IAAMC,EAASH,EAAQ,SAACI,EAAS,CAI/BH,EAAS,OACTF,EAASK,CAAS,CACpB,CAAC,EACD,OAAO,IAAIC,GAAa,UAAA,CAAM,OAAAJ,GAAM,KAAA,OAANA,EAASE,CAAM,CAAf,CAAgB,CAChD,EACA,sBAAqB,UAAA,SAACG,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA
,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACZ,IAAAL,EAAaJ,GAAsB,SAC3C,QAAQI,GAAQ,KAAA,OAARA,EAAU,wBAAyB,uBAAsB,MAAA,OAAAM,EAAA,CAAA,EAAAC,EAAIH,CAAI,CAAA,CAAA,CAC3E,EACA,qBAAoB,UAAA,SAACA,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACX,IAAAL,EAAaJ,GAAsB,SAC3C,QAAQI,GAAQ,KAAA,OAARA,EAAU,uBAAwB,sBAAqB,MAAA,OAAAM,EAAA,CAAA,EAAAC,EAAIH,CAAI,CAAA,CAAA,CACzE,EACA,SAAU,QCrBL,IAAMI,GAAuDC,GAClE,SAACC,EAAM,CACL,OAAA,UAAoC,CAClCA,EAAO,IAAI,EACX,KAAK,KAAO,0BACZ,KAAK,QAAU,qBACjB,CAJA,CAIC,ECXL,IAAAC,EAAA,SAAAC,EAAA,CAAgCC,GAAAF,EAAAC,CAAA,EAwB9B,SAAAD,GAAA,CAAA,IAAAG,EAEEF,EAAA,KAAA,IAAA,GAAO,KAzBT,OAAAE,EAAA,OAAS,GAEDA,EAAA,iBAAyC,KAGjDA,EAAA,UAA2B,CAAA,EAE3BA,EAAA,UAAY,GAEZA,EAAA,SAAW,GAEXA,EAAA,YAAmB,MAenB,CAGA,OAAAH,EAAA,UAAA,KAAA,SAAQI,EAAwB,CAC9B,IAAMC,EAAU,IAAIC,GAAiB,KAAM,IAAI,EAC/C,OAAAD,EAAQ,SAAWD,EACZC,CACT,EAGUL,EAAA,UAAA,eAAV,UAAA,CACE,GAAI,KAAK,OACP,MAAM,IAAIO,EAEd,EAEAP,EAAA,UAAA,KAAA,SAAKQ,EAAQ,CAAb,IAAAL,EAAA,KACEM,GAAa,UAAA,SAEX,GADAN,EAAK,eAAc,EACf,CAACA,EAAK,UAAW,CACdA,EAAK,mBACRA,EAAK,iBAAmB,MAAM,KAAKA,EAAK,SAAS,OAEnD,QAAuBO,EAAAC,GAAAR,EAAK,gBAAgB,EAAAS,EAAAF,EAAA,KAAA,EAAA,CAAAE,EAAA,KAAAA,EAAAF,EAAA,KAAA,EAAE,CAAzC,IAAMG,EAAQD,EAAA,MACjBC,EAAS,KAAKL,CAAK,qGAGzB,CAAC,CACH,EAEAR,EAAA,UAAA,MAAA,SAAMc,EAAQ,CAAd,IAAAX,EAAA,KACEM,GAAa,UAAA,CAEX,GADAN,EAAK,eAAc,EACf,CAACA,EAAK,UAAW,CACnBA,EAAK,SAAWA,EAAK,UAAY,GACjCA,EAAK,YAAcW,EAEnB,QADQC,EAAcZ,EAAI,UACnBY,EAAU,QACfA,EAAU,MAAK,EAAI,MAAMD,CAAG,EAGlC,CAAC,CACH,EAEAd,EAAA,UAAA,SAAA,UAAA,CAAA,IAAAG,EAAA,KACEM,GAAa,UAAA,CAEX,GADAN,EAAK,eAAc,EACf,CAACA,EAAK,UAAW,CACnBA,EAAK,UAAY,GAEjB,QADQY,EAAcZ,EAAI,UACnBY,EAAU,QACfA,EAAU,MAAK,EAAI,SAAQ,EAGjC,CAAC,CACH,EAEAf,EAAA,UAAA,YAAA,UAAA,CACE,KAAK,UAAY,KAAK,OAAS,GAC/B,KAAK,UAAY,KAAK,iBAAmB,IAC3C,EAEA,OAAA,eAAIA,EAAA,UAAA,WAAQ,KAAZ,UAAA,OACE,QAAOgB,EAAA,KAAK,aAAS,MAAAA,IAAA,OAAA,OAAAA,EAAE,QAAS,CAClC,kCAGUhB,EAAA,UAAA,cAAV,SAAwBiB,EAAyB,CAC/C,YAAK,eAAc,EACZhB,EAAA,UAAM,cAAa,KAAA,KAACgB,CAAU,CACvC,EAGUjB,EAAA,UAAA,WAAV,SAAqBiB,EAAyB,CAC5C,YAAK,e
AAc,EACnB,KAAK,wBAAwBA,CAAU,EAChC,KAAK,gBAAgBA,CAAU,CACxC,EAGUjB,EAAA,UAAA,gBAAV,SAA0BiB,EAA2B,CAArD,IAAAd,EAAA,KACQa,EAAqC,KAAnCE,EAAQF,EAAA,SAAEG,EAASH,EAAA,UAAED,EAASC,EAAA,UACtC,OAAIE,GAAYC,EACPC,IAET,KAAK,iBAAmB,KACxBL,EAAU,KAAKE,CAAU,EAClB,IAAII,GAAa,UAAA,CACtBlB,EAAK,iBAAmB,KACxBmB,GAAUP,EAAWE,CAAU,CACjC,CAAC,EACH,EAGUjB,EAAA,UAAA,wBAAV,SAAkCiB,EAA2B,CACrD,IAAAD,EAAuC,KAArCE,EAAQF,EAAA,SAAEO,EAAWP,EAAA,YAAEG,EAASH,EAAA,UACpCE,EACFD,EAAW,MAAMM,CAAW,EACnBJ,GACTF,EAAW,SAAQ,CAEvB,EAQAjB,EAAA,UAAA,aAAA,UAAA,CACE,IAAMwB,EAAkB,IAAIC,EAC5B,OAAAD,EAAW,OAAS,KACbA,CACT,EAxHOxB,EAAA,OAAkC,SAAI0B,EAA0BC,EAAqB,CAC1F,OAAO,IAAIrB,GAAoBoB,EAAaC,CAAM,CACpD,EAuHF3B,GA7IgCyB,CAAU,EAkJ1C,IAAAG,GAAA,SAAAC,EAAA,CAAyCC,GAAAF,EAAAC,CAAA,EACvC,SAAAD,EAESG,EACPC,EAAsB,CAHxB,IAAAC,EAKEJ,EAAA,KAAA,IAAA,GAAO,KAHA,OAAAI,EAAA,YAAAF,EAIPE,EAAK,OAASD,GAChB,CAEA,OAAAJ,EAAA,UAAA,KAAA,SAAKM,EAAQ,UACXC,GAAAC,EAAA,KAAK,eAAW,MAAAA,IAAA,OAAA,OAAAA,EAAE,QAAI,MAAAD,IAAA,QAAAA,EAAA,KAAAC,EAAGF,CAAK,CAChC,EAEAN,EAAA,UAAA,MAAA,SAAMS,EAAQ,UACZF,GAAAC,EAAA,KAAK,eAAW,MAAAA,IAAA,OAAA,OAAAA,EAAE,SAAK,MAAAD,IAAA,QAAAA,EAAA,KAAAC,EAAGC,CAAG,CAC/B,EAEAT,EAAA,UAAA,SAAA,UAAA,UACEO,GAAAC,EAAA,KAAK,eAAW,MAAAA,IAAA,OAAA,OAAAA,EAAE,YAAQ,MAAAD,IAAA,QAAAA,EAAA,KAAAC,CAAA,CAC5B,EAGUR,EAAA,UAAA,WAAV,SAAqBU,EAAyB,SAC5C,OAAOH,GAAAC,EAAA,KAAK,UAAM,MAAAA,IAAA,OAAA,OAAAA,EAAE,UAAUE,CAAU,KAAC,MAAAH,IAAA,OAAAA,EAAII,EAC/C,EACFX,CAAA,EA1ByCY,CAAO,ECxJhD,IAAAC,GAAA,SAAAC,EAAA,CAAwCC,GAAAF,EAAAC,CAAA,EACtC,SAAAD,EAAoBG,EAAS,CAA7B,IAAAC,EACEH,EAAA,KAAA,IAAA,GAAO,KADW,OAAAG,EAAA,OAAAD,GAEpB,CAEA,cAAA,eAAIH,EAAA,UAAA,QAAK,KAAT,UAAA,CACE,OAAO,KAAK,SAAQ,CACtB,kCAGUA,EAAA,UAAA,WAAV,SAAqBK,EAAyB,CAC5C,IAAMC,EAAeL,EAAA,UAAM,WAAU,KAAA,KAACI,CAAU,EAChD,OAACC,EAAa,QAAUD,EAAW,KAAK,KAAK,MAAM,EAC5CC,CACT,EAEAN,EAAA,UAAA,SAAA,UAAA,CACQ,IAAAO,EAAoC,KAAlCC,EAAQD,EAAA,SAAEE,EAAWF,EAAA,YAAEJ,EAAMI,EAAA,OACrC,GAAIC,EACF,MAAMC,EAER,YAAK,eAAc,EACZN,CACT,EAEAH,EAAA,UAAA,KAAA,SAAKU,EAAQ,CACXT,EAAA,UAAM,KAAI,KAAA,KAAE,KAAK,OAASS,CAAM,CAClC,EACFV,CAAA,EA5Bw
CW,CAAO,ECJxC,IAAMC,GAA+C,CAC1D,IAAG,UAAA,CAGD,OAAQA,GAAsB,UAAY,MAAM,IAAG,CACrD,EACA,SAAU,QCwBZ,IAAAC,GAAA,SAAAC,EAAA,CAAsCC,GAAAF,EAAAC,CAAA,EAUpC,SAAAD,EACUG,EACAC,EACAC,EAA6D,CAF7DF,IAAA,SAAAA,EAAA,KACAC,IAAA,SAAAA,EAAA,KACAC,IAAA,SAAAA,EAAAC,IAHV,IAAAC,EAKEN,EAAA,KAAA,IAAA,GAAO,KAJC,OAAAM,EAAA,YAAAJ,EACAI,EAAA,YAAAH,EACAG,EAAA,mBAAAF,EAZFE,EAAA,QAA0B,CAAA,EAC1BA,EAAA,oBAAsB,GAc5BA,EAAK,oBAAsBH,IAAgB,IAC3CG,EAAK,YAAc,KAAK,IAAI,EAAGJ,CAAW,EAC1CI,EAAK,YAAc,KAAK,IAAI,EAAGH,CAAW,GAC5C,CAEA,OAAAJ,EAAA,UAAA,KAAA,SAAKQ,EAAQ,CACL,IAAAC,EAA+E,KAA7EC,EAASD,EAAA,UAAEE,EAAOF,EAAA,QAAEG,EAAmBH,EAAA,oBAAEJ,EAAkBI,EAAA,mBAAEL,EAAWK,EAAA,YAC3EC,IACHC,EAAQ,KAAKH,CAAK,EAClB,CAACI,GAAuBD,EAAQ,KAAKN,EAAmB,IAAG,EAAKD,CAAW,GAE7E,KAAK,YAAW,EAChBH,EAAA,UAAM,KAAI,KAAA,KAACO,CAAK,CAClB,EAGUR,EAAA,UAAA,WAAV,SAAqBa,EAAyB,CAC5C,KAAK,eAAc,EACnB,KAAK,YAAW,EAQhB,QANMC,EAAe,KAAK,gBAAgBD,CAAU,EAE9CJ,EAAmC,KAAjCG,EAAmBH,EAAA,oBAAEE,EAAOF,EAAA,QAG9BM,EAAOJ,EAAQ,MAAK,EACjBK,EAAI,EAAGA,EAAID,EAAK,QAAU,CAACF,EAAW,OAAQG,GAAKJ,EAAsB,EAAI,EACpFC,EAAW,KAAKE,EAAKC,CAAC,CAAM,EAG9B,YAAK,wBAAwBH,CAAU,EAEhCC,CACT,EAEQd,EAAA,UAAA,YAAR,UAAA,CACQ,IAAAS,EAAoE,KAAlEN,EAAWM,EAAA,YAAEJ,EAAkBI,EAAA,mBAAEE,EAAOF,EAAA,QAAEG,EAAmBH,EAAA,oBAK/DQ,GAAsBL,EAAsB,EAAI,GAAKT,EAK3D,GAJAA,EAAc,KAAYc,EAAqBN,EAAQ,QAAUA,EAAQ,OAAO,EAAGA,EAAQ,OAASM,CAAkB,EAIlH,CAACL,EAAqB,CAKxB,QAJMM,EAAMb,EAAmB,IAAG,EAC9Bc,EAAO,EAGFH,EAAI,EAAGA,EAAIL,EAAQ,QAAWA,EAAQK,CAAC,GAAgBE,EAAKF,GAAK,EACxEG,EAAOH,EAETG,GAAQR,EAAQ,OAAO,EAAGQ,EAAO,CAAC,EAEtC,EACFnB,CAAA,EAzEsCoB,CAAO,EClB7C,IAAAC,GAAA,SAAAC,EAAA,CAA+BC,GAAAF,EAAAC,CAAA,EAC7B,SAAAD,EAAYG,EAAsBC,EAAmD,QACnFH,EAAA,KAAA,IAAA,GAAO,IACT,CAWO,OAAAD,EAAA,UAAA,SAAP,SAAgBK,EAAWC,EAAiB,CAAjB,OAAAA,IAAA,SAAAA,EAAA,GAClB,IACT,EACFN,CAAA,EAjB+BO,EAAY,ECHpC,IAAMC,GAAqC,CAGhD,YAAA,SAAYC,EAAqBC,EAAgB,SAAEC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,EAAA,CAAA,EAAA,UAAAA,CAAA,EACzC,IAAAC,EAAaL,GAAgB,SACrC,OAAIK,GAAQ,MAARA,EAAU,YACLA,EAAS,YAAW,MAApBA,EAAQC,EAAA,CAAaL,EAASC,CAAO,EAAAK,EAAKJ,CAAI,CAAA,CAAA
,EAEhD,YAAW,MAAA,OAAAG,EAAA,CAACL,EAASC,CAAO,EAAAK,EAAKJ,CAAI,CAAA,CAAA,CAC9C,EACA,cAAA,SAAcK,EAAM,CACV,IAAAH,EAAaL,GAAgB,SACrC,QAAQK,GAAQ,KAAA,OAARA,EAAU,gBAAiB,eAAeG,CAAa,CACjE,EACA,SAAU,QCrBZ,IAAAC,GAAA,SAAAC,EAAA,CAAoCC,GAAAF,EAAAC,CAAA,EAOlC,SAAAD,EAAsBG,EAAqCC,EAAmD,CAA9G,IAAAC,EACEJ,EAAA,KAAA,KAAME,EAAWC,CAAI,GAAC,KADF,OAAAC,EAAA,UAAAF,EAAqCE,EAAA,KAAAD,EAFjDC,EAAA,QAAmB,IAI7B,CAEO,OAAAL,EAAA,UAAA,SAAP,SAAgBM,EAAWC,EAAiB,OAC1C,GADyBA,IAAA,SAAAA,EAAA,GACrB,KAAK,OACP,OAAO,KAIT,KAAK,MAAQD,EAEb,IAAME,EAAK,KAAK,GACVL,EAAY,KAAK,UAuBvB,OAAIK,GAAM,OACR,KAAK,GAAK,KAAK,eAAeL,EAAWK,EAAID,CAAK,GAKpD,KAAK,QAAU,GAEf,KAAK,MAAQA,EAEb,KAAK,IAAKE,EAAA,KAAK,MAAE,MAAAA,IAAA,OAAAA,EAAI,KAAK,eAAeN,EAAW,KAAK,GAAII,CAAK,EAE3D,IACT,EAEUP,EAAA,UAAA,eAAV,SAAyBG,EAA2BO,EAAmBH,EAAiB,CAAjB,OAAAA,IAAA,SAAAA,EAAA,GAC9DI,GAAiB,YAAYR,EAAU,MAAM,KAAKA,EAAW,IAAI,EAAGI,CAAK,CAClF,EAEUP,EAAA,UAAA,eAAV,SAAyBY,EAA4BJ,EAAkBD,EAAwB,CAE7F,GAFqEA,IAAA,SAAAA,EAAA,GAEjEA,GAAS,MAAQ,KAAK,QAAUA,GAAS,KAAK,UAAY,GAC5D,OAAOC,EAILA,GAAM,MACRG,GAAiB,cAAcH,CAAE,CAIrC,EAMOR,EAAA,UAAA,QAAP,SAAeM,EAAUC,EAAa,CACpC,GAAI,KAAK,OACP,OAAO,IAAI,MAAM,8BAA8B,EAGjD,KAAK,QAAU,GACf,IAAMM,EAAQ,KAAK,SAASP,EAAOC,CAAK,EACxC,GAAIM,EACF,OAAOA,EACE,KAAK,UAAY,IAAS,KAAK,IAAM,OAc9C,KAAK,GAAK,KAAK,eAAe,KAAK,UAAW,KAAK,GAAI,IAAI,EAE/D,EAEUb,EAAA,UAAA,SAAV,SAAmBM,EAAUQ,EAAc,CACzC,IAAIC,EAAmB,GACnBC,EACJ,GAAI,CACF,KAAK,KAAKV,CAAK,QACRW,EAAG,CACVF,EAAU,GAIVC,EAAaC,GAAQ,IAAI,MAAM,oCAAoC,EAErE,GAAIF,EACF,YAAK,YAAW,EACTC,CAEX,EAEAhB,EAAA,UAAA,YAAA,UAAA,CACE,GAAI,CAAC,KAAK,OAAQ,CACV,IAAAS,EAAoB,KAAlBD,EAAEC,EAAA,GAAEN,EAASM,EAAA,UACbS,EAAYf,EAAS,QAE7B,KAAK,KAAO,KAAK,MAAQ,KAAK,UAAY,KAC1C,KAAK,QAAU,GAEfgB,GAAUD,EAAS,IAAI,EACnBV,GAAM,OACR,KAAK,GAAK,KAAK,eAAeL,EAAWK,EAAI,IAAI,GAGnD,KAAK,MAAQ,KACbP,EAAA,UAAM,YAAW,KAAA,IAAA,EAErB,EACFD,CAAA,EA9IoCoB,EAAM,ECgB1C,IAAAC,GAAA,UAAA,CAGE,SAAAA,EAAoBC,EAAoCC,EAAiC,CAAjCA,IAAA,SAAAA,EAAoBF,EAAU,KAAlE,KAAA,oBAAAC,EAClB,KAAK,IAAMC,CACb,CA6BO,OAAAF,EAAA,UAAA,SAAP,SAAmBG,EAAqDC,EAAmBC,EAAS,CAA5B,OAAAD,IAAA,SAAAA,EAAA
,GAC/D,IAAI,KAAK,oBAAuB,KAAMD,CAAI,EAAE,SAASE,EAAOD,CAAK,CAC1E,EAnCcJ,EAAA,IAAoBM,GAAsB,IAoC1DN,GArCA,ECnBA,IAAAO,GAAA,SAAAC,EAAA,CAAoCC,GAAAF,EAAAC,CAAA,EAkBlC,SAAAD,EAAYG,EAAgCC,EAAiC,CAAjCA,IAAA,SAAAA,EAAoBC,GAAU,KAA1E,IAAAC,EACEL,EAAA,KAAA,KAAME,EAAiBC,CAAG,GAAC,KAlBtB,OAAAE,EAAA,QAAmC,CAAA,EAOnCA,EAAA,QAAmB,IAY1B,CAEO,OAAAN,EAAA,UAAA,MAAP,SAAaO,EAAwB,CAC3B,IAAAC,EAAY,KAAI,QAExB,GAAI,KAAK,QAAS,CAChBA,EAAQ,KAAKD,CAAM,EACnB,OAGF,IAAIE,EACJ,KAAK,QAAU,GAEf,EACE,IAAKA,EAAQF,EAAO,QAAQA,EAAO,MAAOA,EAAO,KAAK,EACpD,YAEMA,EAASC,EAAQ,MAAK,GAIhC,GAFA,KAAK,QAAU,GAEXC,EAAO,CACT,KAAQF,EAASC,EAAQ,MAAK,GAC5BD,EAAO,YAAW,EAEpB,MAAME,EAEV,EACFT,CAAA,EAhDoCK,EAAS,EC6CtC,IAAMK,GAAiB,IAAIC,GAAeC,EAAW,EAK/CC,GAAQH,GCjDrB,IAAAI,GAAA,SAAAC,EAAA,CAAoCC,GAAAF,EAAAC,CAAA,EAClC,SAAAD,EAAsBG,EAAqCC,EAAmD,CAA9G,IAAAC,EACEJ,EAAA,KAAA,KAAME,EAAWC,CAAI,GAAC,KADF,OAAAC,EAAA,UAAAF,EAAqCE,EAAA,KAAAD,GAE3D,CAEO,OAAAJ,EAAA,UAAA,SAAP,SAAgBM,EAAWC,EAAiB,CAC1C,OADyBA,IAAA,SAAAA,EAAA,GACrBA,EAAQ,EACHN,EAAA,UAAM,SAAQ,KAAA,KAACK,EAAOC,CAAK,GAEpC,KAAK,MAAQA,EACb,KAAK,MAAQD,EACb,KAAK,UAAU,MAAM,IAAI,EAClB,KACT,EAEON,EAAA,UAAA,QAAP,SAAeM,EAAUC,EAAa,CACpC,OAAOA,EAAQ,GAAK,KAAK,OAASN,EAAA,UAAM,QAAO,KAAA,KAACK,EAAOC,CAAK,EAAI,KAAK,SAASD,EAAOC,CAAK,CAC5F,EAEUP,EAAA,UAAA,eAAV,SAAyBG,EAA2BK,EAAkBD,EAAiB,CAKrF,OALoEA,IAAA,SAAAA,EAAA,GAK/DA,GAAS,MAAQA,EAAQ,GAAOA,GAAS,MAAQ,KAAK,MAAQ,EAC1DN,EAAA,UAAM,eAAc,KAAA,KAACE,EAAWK,EAAID,CAAK,GAIlDJ,EAAU,MAAM,IAAI,EAMb,EACT,EACFH,CAAA,EArCoCS,EAAW,ECJ/C,IAAAC,GAAA,SAAAC,EAAA,CAAoCC,GAAAF,EAAAC,CAAA,EAApC,SAAAD,GAAA,+CACA,CAAA,OAAAA,CAAA,EADoCG,EAAc,ECgE3C,IAAMC,GAAiB,IAAIC,GAAeC,EAAW,EC5D5D,IAAAC,GAAA,SAAAC,EAAA,CAA6CC,GAAAF,EAAAC,CAAA,EAC3C,SAAAD,EAAsBG,EAA8CC,EAAmD,CAAvH,IAAAC,EACEJ,EAAA,KAAA,KAAME,EAAWC,CAAI,GAAC,KADF,OAAAC,EAAA,UAAAF,EAA8CE,EAAA,KAAAD,GAEpE,CAEU,OAAAJ,EAAA,UAAA,eAAV,SAAyBG,EAAoCG,EAAkBC,EAAiB,CAE9F,OAF6EA,IAAA,SAAAA,EAAA,GAEzEA,IAAU,MAAQA,EAAQ,EACrBN,EAAA,UAAM,eAAc,KAAA,KAACE,EAAWG,EAAIC,CAAK,GAGlDJ,EAAU,QAAQ,KAAK,IAAI,EAIpBA,EAAU,aAAeA,EAAU,WAAaK,GAAuB,sBAAsB,
UAAA,CAAM,OAAAL,EAAU,MAAM,MAAS,CAAzB,CAA0B,GACtI,EAEUH,EAAA,UAAA,eAAV,SAAyBG,EAAoCG,EAAkBC,EAAiB,OAI9F,GAJ6EA,IAAA,SAAAA,EAAA,GAIzEA,GAAS,KAAOA,EAAQ,EAAI,KAAK,MAAQ,EAC3C,OAAON,EAAA,UAAM,eAAc,KAAA,KAACE,EAAWG,EAAIC,CAAK,EAK1C,IAAAE,EAAYN,EAAS,QACzBG,GAAM,QAAQI,EAAAD,EAAQA,EAAQ,OAAS,CAAC,KAAC,MAAAC,IAAA,OAAA,OAAAA,EAAE,MAAOJ,IACpDE,GAAuB,qBAAqBF,CAAY,EACxDH,EAAU,WAAa,OAI3B,EACFH,CAAA,EApC6CW,EAAW,ECHxD,IAAAC,GAAA,SAAAC,EAAA,CAA6CC,GAAAF,EAAAC,CAAA,EAA7C,SAAAD,GAAA,+CAkCA,CAjCS,OAAAA,EAAA,UAAA,MAAP,SAAaG,EAAyB,CACpC,KAAK,QAAU,GAUf,IAAMC,EAAU,KAAK,WACrB,KAAK,WAAa,OAEV,IAAAC,EAAY,KAAI,QACpBC,EACJH,EAASA,GAAUE,EAAQ,MAAK,EAEhC,EACE,IAAKC,EAAQH,EAAO,QAAQA,EAAO,MAAOA,EAAO,KAAK,EACpD,aAEMA,EAASE,EAAQ,CAAC,IAAMF,EAAO,KAAOC,GAAWC,EAAQ,MAAK,GAIxE,GAFA,KAAK,QAAU,GAEXC,EAAO,CACT,MAAQH,EAASE,EAAQ,CAAC,IAAMF,EAAO,KAAOC,GAAWC,EAAQ,MAAK,GACpEF,EAAO,YAAW,EAEpB,MAAMG,EAEV,EACFN,CAAA,EAlC6CO,EAAc,ECgCpD,IAAMC,GAA0B,IAAIC,GAAwBC,EAAoB,EC8BhF,IAAMC,EAAQ,IAAIC,EAAkB,SAACC,EAAU,CAAK,OAAAA,EAAW,SAAQ,CAAnB,CAAqB,EC9D1E,SAAUC,GAAYC,EAAU,CACpC,OAAOA,GAASC,EAAWD,EAAM,QAAQ,CAC3C,CCDA,SAASE,GAAQC,EAAQ,CACvB,OAAOA,EAAIA,EAAI,OAAS,CAAC,CAC3B,CAEM,SAAUC,GAAkBC,EAAW,CAC3C,OAAOC,EAAWJ,GAAKG,CAAI,CAAC,EAAIA,EAAK,IAAG,EAAK,MAC/C,CAEM,SAAUE,GAAaF,EAAW,CACtC,OAAOG,GAAYN,GAAKG,CAAI,CAAC,EAAIA,EAAK,IAAG,EAAK,MAChD,CAEM,SAAUI,GAAUJ,EAAaK,EAAoB,CACzD,OAAO,OAAOR,GAAKG,CAAI,GAAM,SAAWA,EAAK,IAAG,EAAMK,CACxD,CClBO,IAAMC,GAAe,SAAIC,EAAM,CAAwB,OAAAA,GAAK,OAAOA,EAAE,QAAW,UAAY,OAAOA,GAAM,UAAlD,ECMxD,SAAUC,GAAUC,EAAU,CAClC,OAAOC,EAAWD,GAAK,KAAA,OAALA,EAAO,IAAI,CAC/B,CCHM,SAAUE,GAAoBC,EAAU,CAC5C,OAAOC,EAAWD,EAAME,EAAiB,CAAC,CAC5C,CCLM,SAAUC,GAAmBC,EAAQ,CACzC,OAAO,OAAO,eAAiBC,EAAWD,GAAG,KAAA,OAAHA,EAAM,OAAO,aAAa,CAAC,CACvE,CCAM,SAAUE,GAAiCC,EAAU,CAEzD,OAAO,IAAI,UACT,iBACEA,IAAU,MAAQ,OAAOA,GAAU,SAAW,oBAAsB,IAAIA,EAAK,KAAG,0HACwC,CAE9H,CCXM,SAAUC,IAAiB,CAC/B,OAAI,OAAO,QAAW,YAAc,CAAC,OAAO,SACnC,aAGF,OAAO,QAChB,CAEO,IAAMC,GAAWD,GAAiB,ECJnC,SAAUE,GAAWC,EAAU,CACnC,OAAOC,EAAWD,GAAK,KAAA,OAALA,EAAQE,EAAe,CAAC,CAC5C,CCHM,SAAiBC,GA
AsCC,EAAqC,mGAC1FC,EAASD,EAAe,UAAS,2DAGX,MAAA,CAAA,EAAAE,GAAMD,EAAO,KAAI,CAAE,CAAA,gBAArCE,EAAkBC,EAAA,KAAA,EAAhBC,EAAKF,EAAA,MAAEG,EAAIH,EAAA,KACfG,iBAAA,CAAA,EAAA,CAAA,SACF,MAAA,CAAA,EAAAF,EAAA,KAAA,CAAA,qBAEIC,CAAM,CAAA,SAAZ,MAAA,CAAA,EAAAD,EAAA,KAAA,CAAA,SAAA,OAAAA,EAAA,KAAA,mCAGF,OAAAH,EAAO,YAAW,6BAIhB,SAAUM,GAAwBC,EAAQ,CAG9C,OAAOC,EAAWD,GAAG,KAAA,OAAHA,EAAK,SAAS,CAClC,CCPM,SAAUE,EAAaC,EAAyB,CACpD,GAAIA,aAAiBC,EACnB,OAAOD,EAET,GAAIA,GAAS,KAAM,CACjB,GAAIE,GAAoBF,CAAK,EAC3B,OAAOG,GAAsBH,CAAK,EAEpC,GAAII,GAAYJ,CAAK,EACnB,OAAOK,GAAcL,CAAK,EAE5B,GAAIM,GAAUN,CAAK,EACjB,OAAOO,GAAYP,CAAK,EAE1B,GAAIQ,GAAgBR,CAAK,EACvB,OAAOS,GAAkBT,CAAK,EAEhC,GAAIU,GAAWV,CAAK,EAClB,OAAOW,GAAaX,CAAK,EAE3B,GAAIY,GAAqBZ,CAAK,EAC5B,OAAOa,GAAuBb,CAAK,EAIvC,MAAMc,GAAiCd,CAAK,CAC9C,CAMM,SAAUG,GAAyBY,EAAQ,CAC/C,OAAO,IAAId,EAAW,SAACe,EAAyB,CAC9C,IAAMC,EAAMF,EAAIG,EAAiB,EAAC,EAClC,GAAIC,EAAWF,EAAI,SAAS,EAC1B,OAAOA,EAAI,UAAUD,CAAU,EAGjC,MAAM,IAAI,UAAU,gEAAgE,CACtF,CAAC,CACH,CASM,SAAUX,GAAiBe,EAAmB,CAClD,OAAO,IAAInB,EAAW,SAACe,EAAyB,CAU9C,QAASK,EAAI,EAAGA,EAAID,EAAM,QAAU,CAACJ,EAAW,OAAQK,IACtDL,EAAW,KAAKI,EAAMC,CAAC,CAAC,EAE1BL,EAAW,SAAQ,CACrB,CAAC,CACH,CAEM,SAAUT,GAAee,EAAuB,CACpD,OAAO,IAAIrB,EAAW,SAACe,EAAyB,CAC9CM,EACG,KACC,SAACC,EAAK,CACCP,EAAW,SACdA,EAAW,KAAKO,CAAK,EACrBP,EAAW,SAAQ,EAEvB,EACA,SAACQ,EAAQ,CAAK,OAAAR,EAAW,MAAMQ,CAAG,CAApB,CAAqB,EAEpC,KAAK,KAAMC,EAAoB,CACpC,CAAC,CACH,CAEM,SAAUd,GAAgBe,EAAqB,CACnD,OAAO,IAAIzB,EAAW,SAACe,EAAyB,aAC9C,QAAoBW,EAAAC,GAAAF,CAAQ,EAAAG,EAAAF,EAAA,KAAA,EAAA,CAAAE,EAAA,KAAAA,EAAAF,EAAA,KAAA,EAAE,CAAzB,IAAMJ,EAAKM,EAAA,MAEd,GADAb,EAAW,KAAKO,CAAK,EACjBP,EAAW,OACb,yGAGJA,EAAW,SAAQ,CACrB,CAAC,CACH,CAEM,SAAUP,GAAqBqB,EAA+B,CAClE,OAAO,IAAI7B,EAAW,SAACe,EAAyB,CAC9Ce,GAAQD,EAAed,CAAU,EAAE,MAAM,SAACQ,EAAG,CAAK,OAAAR,EAAW,MAAMQ,CAAG,CAApB,CAAqB,CACzE,CAAC,CACH,CAEM,SAAUX,GAA0BmB,EAAqC,CAC7E,OAAOvB,GAAkBwB,GAAmCD,CAAc,CAAC,CAC7E,CAEA,SAAeD,GAAWD,EAAiCd,EAAyB,uIACxDkB,EAAAC,GAAAL,CAAa,gFAIrC,GAJeP,EAAKa,EAAA,MACpBpB,EAAW,KAAKO,CAAK,EAGjBP,EAAW,OACb,MAAA,CAAA,CAAA,6RAGJ,O
AAAA,EAAW,SAAQ,WChHf,SAAUqB,GACdC,EACAC,EACAC,EACAC,EACAC,EAAc,CADdD,IAAA,SAAAA,EAAA,GACAC,IAAA,SAAAA,EAAA,IAEA,IAAMC,EAAuBJ,EAAU,SAAS,UAAA,CAC9CC,EAAI,EACAE,EACFJ,EAAmB,IAAI,KAAK,SAAS,KAAMG,CAAK,CAAC,EAEjD,KAAK,YAAW,CAEpB,EAAGA,CAAK,EAIR,GAFAH,EAAmB,IAAIK,CAAoB,EAEvC,CAACD,EAKH,OAAOC,CAEX,CCeM,SAAUC,GAAaC,EAA0BC,EAAS,CAAT,OAAAA,IAAA,SAAAA,EAAA,GAC9CC,EAAQ,SAACC,EAAQC,EAAU,CAChCD,EAAO,UACLE,EACED,EACA,SAACE,EAAK,CAAK,OAAAC,GAAgBH,EAAYJ,EAAW,UAAA,CAAM,OAAAI,EAAW,KAAKE,CAAK,CAArB,EAAwBL,CAAK,CAA1E,EACX,UAAA,CAAM,OAAAM,GAAgBH,EAAYJ,EAAW,UAAA,CAAM,OAAAI,EAAW,SAAQ,CAAnB,EAAuBH,CAAK,CAAzE,EACN,SAACO,EAAG,CAAK,OAAAD,GAAgBH,EAAYJ,EAAW,UAAA,CAAM,OAAAI,EAAW,MAAMI,CAAG,CAApB,EAAuBP,CAAK,CAAzE,CAA0E,CACpF,CAEL,CAAC,CACH,CCPM,SAAUQ,GAAeC,EAA0BC,EAAiB,CAAjB,OAAAA,IAAA,SAAAA,EAAA,GAChDC,EAAQ,SAACC,EAAQC,EAAU,CAChCA,EAAW,IAAIJ,EAAU,SAAS,UAAA,CAAM,OAAAG,EAAO,UAAUC,CAAU,CAA3B,EAA8BH,CAAK,CAAC,CAC9E,CAAC,CACH,CC7DM,SAAUI,GAAsBC,EAA6BC,EAAwB,CACzF,OAAOC,EAAUF,CAAK,EAAE,KAAKG,GAAYF,CAAS,EAAGG,GAAUH,CAAS,CAAC,CAC3E,CCFM,SAAUI,GAAmBC,EAAuBC,EAAwB,CAChF,OAAOC,EAAUF,CAAK,EAAE,KAAKG,GAAYF,CAAS,EAAGG,GAAUH,CAAS,CAAC,CAC3E,CCJM,SAAUI,GAAiBC,EAAqBC,EAAwB,CAC5E,OAAO,IAAIC,EAAc,SAACC,EAAU,CAElC,IAAIC,EAAI,EAER,OAAOH,EAAU,SAAS,UAAA,CACpBG,IAAMJ,EAAM,OAGdG,EAAW,SAAQ,GAInBA,EAAW,KAAKH,EAAMI,GAAG,CAAC,EAIrBD,EAAW,QACd,KAAK,SAAQ,EAGnB,CAAC,CACH,CAAC,CACH,CCfM,SAAUE,GAAoBC,EAAoBC,EAAwB,CAC9E,OAAO,IAAIC,EAAc,SAACC,EAAU,CAClC,IAAIC,EAKJ,OAAAC,GAAgBF,EAAYF,EAAW,UAAA,CAErCG,EAAYJ,EAAcI,EAAe,EAAC,EAE1CC,GACEF,EACAF,EACA,UAAA,OACMK,EACAC,EACJ,GAAI,CAEDC,EAAkBJ,EAAS,KAAI,EAA7BE,EAAKE,EAAA,MAAED,EAAIC,EAAA,WACPC,EAAK,CAEZN,EAAW,MAAMM,CAAG,EACpB,OAGEF,EAKFJ,EAAW,SAAQ,EAGnBA,EAAW,KAAKG,CAAK,CAEzB,EACA,EACA,EAAI,CAER,CAAC,EAMM,UAAA,CAAM,OAAAI,EAAWN,GAAQ,KAAA,OAARA,EAAU,MAAM,GAAKA,EAAS,OAAM,CAA/C,CACf,CAAC,CACH,CCvDM,SAAUO,GAAyBC,EAAyBC,EAAwB,CACxF,GAAI,CAACD,EACH,MAAM,IAAI,MAAM,yBAAyB,EAE3C,OAAO,IAAIE,EAAc,SAACC,EAAU,CAClCC,GAAgBD,EAAYF,EAAW,UAAA,CACrC,IAAMI,EAAWL,EAAM,OAAO,aAAa,EAAC,EAC5CI,GACED,EACAF,EACA,UAAA,
CACEI,EAAS,KAAI,EAAG,KAAK,SAACC,EAAM,CACtBA,EAAO,KAGTH,EAAW,SAAQ,EAEnBA,EAAW,KAAKG,EAAO,KAAK,CAEhC,CAAC,CACH,EACA,EACA,EAAI,CAER,CAAC,CACH,CAAC,CACH,CCzBM,SAAUC,GAA8BC,EAA8BC,EAAwB,CAClG,OAAOC,GAAsBC,GAAmCH,CAAK,EAAGC,CAAS,CACnF,CCoBM,SAAUG,GAAaC,EAA2BC,EAAwB,CAC9E,GAAID,GAAS,KAAM,CACjB,GAAIE,GAAoBF,CAAK,EAC3B,OAAOG,GAAmBH,EAAOC,CAAS,EAE5C,GAAIG,GAAYJ,CAAK,EACnB,OAAOK,GAAcL,EAAOC,CAAS,EAEvC,GAAIK,GAAUN,CAAK,EACjB,OAAOO,GAAgBP,EAAOC,CAAS,EAEzC,GAAIO,GAAgBR,CAAK,EACvB,OAAOS,GAAsBT,EAAOC,CAAS,EAE/C,GAAIS,GAAWV,CAAK,EAClB,OAAOW,GAAiBX,EAAOC,CAAS,EAE1C,GAAIW,GAAqBZ,CAAK,EAC5B,OAAOa,GAA2Bb,EAAOC,CAAS,EAGtD,MAAMa,GAAiCd,CAAK,CAC9C,CCoDM,SAAUe,GAAQC,EAA2BC,EAAyB,CAC1E,OAAOA,EAAYC,GAAUF,EAAOC,CAAS,EAAIE,EAAUH,CAAK,CAClE,CCxBM,SAAUI,GAAE,SAAIC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACpB,IAAMC,EAAYC,GAAaH,CAAI,EACnC,OAAOI,GAAKJ,EAAaE,CAAS,CACpC,CCsCM,SAAUG,GAAWC,EAA0BC,EAAyB,CAC5E,IAAMC,EAAeC,EAAWH,CAAmB,EAAIA,EAAsB,UAAA,CAAM,OAAAA,CAAA,EAC7EI,EAAO,SAACC,EAA6B,CAAK,OAAAA,EAAW,MAAMH,EAAY,CAAE,CAA/B,EAChD,OAAO,IAAII,EAAWL,EAAY,SAACI,EAAU,CAAK,OAAAJ,EAAU,SAASG,EAAa,EAAGC,CAAU,CAA7C,EAAiDD,CAAI,CACzG,CCpGO,IAAMG,GAA6BC,GAAiB,SAACC,EAAM,CAAK,OAAA,UAAuB,CAC5FA,EAAO,IAAI,EACX,KAAK,KAAO,aACZ,KAAK,QAAU,yBACjB,CAJuE,CAItE,ECrBK,SAAUC,GAAYC,EAAU,CACpC,OAAOA,aAAiB,MAAQ,CAAC,MAAMA,CAAY,CACrD,CCsCM,SAAUC,EAAUC,EAAyCC,EAAa,CAC9E,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAEhC,IAAIC,EAAQ,EAGZF,EAAO,UACLG,EAAyBF,EAAY,SAACG,EAAQ,CAG5CH,EAAW,KAAKJ,EAAQ,KAAKC,EAASM,EAAOF,GAAO,CAAC,CACvD,CAAC,CAAC,CAEN,CAAC,CACH,CC1DQ,IAAAG,GAAY,MAAK,QAEzB,SAASC,GAAkBC,EAA6BC,EAAW,CAC/D,OAAOH,GAAQG,CAAI,EAAID,EAAE,MAAA,OAAAE,EAAA,CAAA,EAAAC,EAAIF,CAAI,CAAA,CAAA,EAAID,EAAGC,CAAI,CAChD,CAMM,SAAUG,GAAuBJ,EAA2B,CAC9D,OAAOK,EAAI,SAAAJ,EAAI,CAAI,OAAAF,GAAYC,EAAIC,CAAI,CAApB,CAAqB,CAC5C,CCfQ,IAAAK,GAAY,MAAK,QACjBC,GAA0D,OAAM,eAArCC,GAA+B,OAAM,UAAlBC,GAAY,OAAM,KAQlE,SAAUC,GAAqDC,EAAuB,CAC1F,GAAIA,EAAK,SAAW,EAAG,CACrB,IAAMC,EAAQD,EAAK,CAAC,EACpB,GAAIL,GAAQM,CAAK,EACf,MAAO,CAAE,KAAMA,EAAO,KAAM,IAAI,E
AElC,GAAIC,GAAOD,CAAK,EAAG,CACjB,IAAME,EAAOL,GAAQG,CAAK,EAC1B,MAAO,CACL,KAAME,EAAK,IAAI,SAACC,EAAG,CAAK,OAAAH,EAAMG,CAAG,CAAT,CAAU,EAClC,KAAID,IAKV,MAAO,CAAE,KAAMH,EAAa,KAAM,IAAI,CACxC,CAEA,SAASE,GAAOG,EAAQ,CACtB,OAAOA,GAAO,OAAOA,GAAQ,UAAYT,GAAeS,CAAG,IAAMR,EACnE,CC7BM,SAAUS,GAAaC,EAAgBC,EAAa,CACxD,OAAOD,EAAK,OAAO,SAACE,EAAQC,EAAKC,EAAC,CAAK,OAAEF,EAAOC,CAAG,EAAIF,EAAOG,CAAC,EAAIF,CAA5B,EAAqC,CAAA,CAAS,CACvF,CCsMM,SAAUG,GAAa,SAAoCC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAC/D,IAAMC,EAAYC,GAAaH,CAAI,EAC7BI,EAAiBC,GAAkBL,CAAI,EAEvCM,EAA8BC,GAAqBP,CAAI,EAA/CQ,EAAWF,EAAA,KAAEG,EAAIH,EAAA,KAE/B,GAAIE,EAAY,SAAW,EAIzB,OAAOE,GAAK,CAAA,EAAIR,CAAgB,EAGlC,IAAMS,EAAS,IAAIC,EACjBC,GACEL,EACAN,EACAO,EAEI,SAACK,EAAM,CAAK,OAAAC,GAAaN,EAAMK,CAAM,CAAzB,EAEZE,EAAQ,CACb,EAGH,OAAOZ,EAAkBO,EAAO,KAAKM,GAAiBb,CAAc,CAAC,EAAsBO,CAC7F,CAEM,SAAUE,GACdL,EACAN,EACAgB,EAAiD,CAAjD,OAAAA,IAAA,SAAAA,EAAAF,IAEO,SAACG,EAA2B,CAGjCC,GACElB,EACA,UAAA,CAaE,QAZQmB,EAAWb,EAAW,OAExBM,EAAS,IAAI,MAAMO,CAAM,EAG3BC,EAASD,EAITE,EAAuBF,aAGlBG,EAAC,CACRJ,GACElB,EACA,UAAA,CACE,IAAMuB,EAASf,GAAKF,EAAYgB,CAAC,EAAGtB,CAAgB,EAChDwB,EAAgB,GACpBD,EAAO,UACLE,EACER,EACA,SAACS,EAAK,CAEJd,EAAOU,CAAC,EAAII,EACPF,IAEHA,EAAgB,GAChBH,KAEGA,GAGHJ,EAAW,KAAKD,EAAeJ,EAAO,MAAK,CAAE,CAAC,CAElD,EACA,UAAA,CACO,EAAEQ,GAGLH,EAAW,SAAQ,CAEvB,CAAC,CACF,CAEL,EACAA,CAAU,GAjCLK,EAAI,EAAGA,EAAIH,EAAQG,MAAnBA,CAAC,CAoCZ,EACAL,CAAU,CAEd,CACF,CAMA,SAASC,GAAclB,EAAsC2B,EAAqBC,EAA0B,CACtG5B,EACF6B,GAAgBD,EAAc5B,EAAW2B,CAAO,EAEhDA,EAAO,CAEX,CC3RM,SAAUG,GACdC,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,EAAgC,CAGhC,IAAMC,EAAc,CAAA,EAEhBC,EAAS,EAETC,EAAQ,EAERC,EAAa,GAKXC,EAAgB,UAAA,CAIhBD,GAAc,CAACH,EAAO,QAAU,CAACC,GACnCR,EAAW,SAAQ,CAEvB,EAGMY,EAAY,SAACC,EAAQ,CAAK,OAACL,EAASN,EAAaY,EAAWD,CAAK,EAAIN,EAAO,KAAKM,CAAK,CAA5D,EAE1BC,EAAa,SAACD,EAAQ,CAI1BT,GAAUJ,EAAW,KAAKa,CAAY,EAItCL,IAKA,IAAIO,GAAgB,GAGpBC,EAAUf,EAAQY,EAAOJ,GAAO,CAAC,EAAE,UACjCQ,EACEjB,EACA,SAACkB,GAAU,CAGTf,GAAY,MAAZA,EAAee,EAAU,EAErBd,EAGFQ,EAAUM,EAAiB,EAG3
BlB,EAAW,KAAKkB,EAAU,CAE9B,EACA,UAAA,CAGEH,GAAgB,EAClB,EAEA,OACA,UAAA,CAIE,GAAIA,GAKF,GAAI,CAIFP,IAKA,sBACE,IAAMW,EAAgBZ,EAAO,MAAK,EAI9BF,EACFe,GAAgBpB,EAAYK,EAAmB,UAAA,CAAM,OAAAS,EAAWK,CAAa,CAAxB,CAAyB,EAE9EL,EAAWK,CAAa,GARrBZ,EAAO,QAAUC,EAASN,QAYjCS,EAAa,QACNU,EAAK,CACZrB,EAAW,MAAMqB,CAAG,EAG1B,CAAC,CACF,CAEL,EAGA,OAAAtB,EAAO,UACLkB,EAAyBjB,EAAYY,EAAW,UAAA,CAE9CF,EAAa,GACbC,EAAa,CACf,CAAC,CAAC,EAKG,UAAA,CACLL,GAAmB,MAAnBA,EAAmB,CACrB,CACF,CClEM,SAAUgB,GACdC,EACAC,EACAC,EAA6B,CAE7B,OAFAA,IAAA,SAAAA,EAAA,KAEIC,EAAWF,CAAc,EAEpBF,GAAS,SAACK,EAAGC,EAAC,CAAK,OAAAC,EAAI,SAACC,EAAQC,EAAU,CAAK,OAAAP,EAAeG,EAAGG,EAAGF,EAAGG,CAAE,CAA1B,CAA2B,EAAEC,EAAUT,EAAQI,EAAGC,CAAC,CAAC,CAAC,CAAjF,EAAoFH,CAAU,GAC/G,OAAOD,GAAmB,WACnCC,EAAaD,GAGRS,EAAQ,SAACC,EAAQC,EAAU,CAAK,OAAAC,GAAeF,EAAQC,EAAYZ,EAASE,CAAU,CAAtD,CAAuD,EAChG,CChCM,SAAUY,GAAyCC,EAA6B,CAA7B,OAAAA,IAAA,SAAAA,EAAA,KAChDC,GAASC,GAAUF,CAAU,CACtC,CCNM,SAAUG,IAAS,CACvB,OAAOC,GAAS,CAAC,CACnB,CCmDM,SAAUC,IAAM,SAACC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACrB,OAAOC,GAAS,EAAGC,GAAKH,EAAMI,GAAaJ,CAAI,CAAC,CAAC,CACnD,CC9DM,SAAUK,EAAsCC,EAA0B,CAC9E,OAAO,IAAIC,EAA+B,SAACC,EAAU,CACnDC,EAAUH,EAAiB,CAAE,EAAE,UAAUE,CAAU,CACrD,CAAC,CACH,CChDA,IAAME,GAA0B,CAAC,cAAe,gBAAgB,EAC1DC,GAAqB,CAAC,mBAAoB,qBAAqB,EAC/DC,GAAgB,CAAC,KAAM,KAAK,EAkO5B,SAAUC,EACdC,EACAC,EACAC,EACAC,EAAsC,CAMtC,GAJIC,EAAWF,CAAO,IACpBC,EAAiBD,EACjBA,EAAU,QAERC,EACF,OAAOJ,EAAaC,EAAQC,EAAWC,CAA+B,EAAE,KAAKG,GAAiBF,CAAc,CAAC,EAUzG,IAAAG,EAAAC,EAEJC,GAAcR,CAAM,EAChBH,GAAmB,IAAI,SAACY,EAAU,CAAK,OAAA,SAACC,EAAY,CAAK,OAAAV,EAAOS,CAAU,EAAER,EAAWS,EAASR,CAA+B,CAAtE,CAAlB,CAAyF,EAElIS,GAAwBX,CAAM,EAC5BJ,GAAwB,IAAIgB,GAAwBZ,EAAQC,CAAS,CAAC,EACtEY,GAA0Bb,CAAM,EAChCF,GAAc,IAAIc,GAAwBZ,EAAQC,CAAS,CAAC,EAC5D,CAAA,EAAE,CAAA,EATDa,EAAGR,EAAA,CAAA,EAAES,EAAMT,EAAA,CAAA,EAgBlB,GAAI,CAACQ,GACCE,GAAYhB,CAAM,EACpB,OAAOiB,GAAS,SAACC,EAAc,CAAK,OAAAnB,EAAUmB,EAAWjB,EAAWC,CAA+B,CAA/D,CAAgE,EAClGiB,EAAUnB,CAAM,CAAC,EAOvB,GAAI,CAACc,EACH,MAAM,IAAI,UAAU,sBAAsB,EAG5C,
OAAO,IAAIM,EAAc,SAACC,EAAU,CAIlC,IAAMX,EAAU,UAAA,SAACY,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAAmB,OAAAF,EAAW,KAAK,EAAIC,EAAK,OAASA,EAAOA,EAAK,CAAC,CAAC,CAAhD,EAEpC,OAAAR,EAAIJ,CAAO,EAEJ,UAAA,CAAM,OAAAK,EAAQL,CAAO,CAAf,CACf,CAAC,CACH,CASA,SAASE,GAAwBZ,EAAaC,EAAiB,CAC7D,OAAO,SAACQ,EAAkB,CAAK,OAAA,SAACC,EAAY,CAAK,OAAAV,EAAOS,CAAU,EAAER,EAAWS,CAAO,CAArC,CAAlB,CACjC,CAOA,SAASC,GAAwBX,EAAW,CAC1C,OAAOI,EAAWJ,EAAO,WAAW,GAAKI,EAAWJ,EAAO,cAAc,CAC3E,CAOA,SAASa,GAA0Bb,EAAW,CAC5C,OAAOI,EAAWJ,EAAO,EAAE,GAAKI,EAAWJ,EAAO,GAAG,CACvD,CAOA,SAASQ,GAAcR,EAAW,CAChC,OAAOI,EAAWJ,EAAO,gBAAgB,GAAKI,EAAWJ,EAAO,mBAAmB,CACrF,CCnMM,SAAUwB,GACdC,EACAC,EACAC,EAAsC,CAEtC,OAAIA,EACKH,GAAoBC,EAAYC,CAAa,EAAE,KAAKE,GAAiBD,CAAc,CAAC,EAGtF,IAAIE,EAAoB,SAACC,EAAU,CACxC,IAAMC,EAAU,UAAA,SAACC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAAc,OAAAH,EAAW,KAAKE,EAAE,SAAW,EAAIA,EAAE,CAAC,EAAIA,CAAC,CAAzC,EACzBE,EAAWT,EAAWM,CAAO,EACnC,OAAOI,EAAWT,CAAa,EAAI,UAAA,CAAM,OAAAA,EAAcK,EAASG,CAAQ,CAA/B,EAAmC,MAC9E,CAAC,CACH,CCtBM,SAAUE,GACdC,EACAC,EACAC,EAAyC,CAFzCF,IAAA,SAAAA,EAAA,GAEAE,IAAA,SAAAA,EAAAC,IAIA,IAAIC,EAAmB,GAEvB,OAAIH,GAAuB,OAIrBI,GAAYJ,CAAmB,EACjCC,EAAYD,EAIZG,EAAmBH,GAIhB,IAAIK,EAAW,SAACC,EAAU,CAI/B,IAAIC,EAAMC,GAAYT,CAAO,EAAI,CAACA,EAAUE,EAAW,IAAG,EAAKF,EAE3DQ,EAAM,IAERA,EAAM,GAIR,IAAIE,EAAI,EAGR,OAAOR,EAAU,SAAS,UAAA,CACnBK,EAAW,SAEdA,EAAW,KAAKG,GAAG,EAEf,GAAKN,EAGP,KAAK,SAAS,OAAWA,CAAgB,EAGzCG,EAAW,SAAQ,EAGzB,EAAGC,CAAG,CACR,CAAC,CACH,CChGM,SAAUG,GAAK,SAACC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACpB,IAAMC,EAAYC,GAAaH,CAAI,EAC7BI,EAAaC,GAAUL,EAAM,GAAQ,EACrCM,EAAUN,EAChB,OAAQM,EAAQ,OAGZA,EAAQ,SAAW,EAEnBC,EAAUD,EAAQ,CAAC,CAAC,EAEpBE,GAASJ,CAAU,EAAEK,GAAKH,EAASJ,CAAS,CAAC,EAL7CQ,CAMN,CCjEO,IAAMC,GAAQ,IAAIC,EAAkBC,EAAI,ECpCvC,IAAAC,GAAY,MAAK,QAMnB,SAAUC,GAAkBC,EAAiB,CACjD,OAAOA,EAAK,SAAW,GAAKF,GAAQE,EAAK,CAAC,CAAC,EAAIA,EAAK,CAAC,EAAKA,CAC5D,CCoDM,SAAUC,EAAUC,EAAiDC,EAAa,CACtF,OAAOC,EAAQ,SAACC,EAAQC
,EAAU,CAEhC,IAAIC,EAAQ,EAIZF,EAAO,UAILG,EAAyBF,EAAY,SAACG,EAAK,CAAK,OAAAP,EAAU,KAAKC,EAASM,EAAOF,GAAO,GAAKD,EAAW,KAAKG,CAAK,CAAhE,CAAiE,CAAC,CAEtH,CAAC,CACH,CCxBM,SAAUC,IAAG,SAACC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAClB,IAAMC,EAAiBC,GAAkBH,CAAI,EAEvCI,EAAUC,GAAeL,CAAI,EAEnC,OAAOI,EAAQ,OACX,IAAIE,EAAsB,SAACC,EAAU,CAGnC,IAAIC,EAAuBJ,EAAQ,IAAI,UAAA,CAAM,MAAA,CAAA,CAAA,CAAE,EAK3CK,EAAYL,EAAQ,IAAI,UAAA,CAAM,MAAA,EAAA,CAAK,EAGvCG,EAAW,IAAI,UAAA,CACbC,EAAUC,EAAY,IACxB,CAAC,EAKD,mBAASC,EAAW,CAClBC,EAAUP,EAAQM,CAAW,CAAC,EAAE,UAC9BE,EACEL,EACA,SAACM,EAAK,CAKJ,GAJAL,EAAQE,CAAW,EAAE,KAAKG,CAAK,EAI3BL,EAAQ,MAAM,SAACM,EAAM,CAAK,OAAAA,EAAO,MAAP,CAAa,EAAG,CAC5C,IAAMC,EAAcP,EAAQ,IAAI,SAACM,EAAM,CAAK,OAAAA,EAAO,MAAK,CAAZ,CAAe,EAE3DP,EAAW,KAAKL,EAAiBA,EAAc,MAAA,OAAAc,EAAA,CAAA,EAAAC,EAAIF,CAAM,CAAA,CAAA,EAAIA,CAAM,EAI/DP,EAAQ,KAAK,SAACM,EAAQI,EAAC,CAAK,MAAA,CAACJ,EAAO,QAAUL,EAAUS,CAAC,CAA7B,CAA8B,GAC5DX,EAAW,SAAQ,EAGzB,EACA,UAAA,CAGEE,EAAUC,CAAW,EAAI,GAIzB,CAACF,EAAQE,CAAW,EAAE,QAAUH,EAAW,SAAQ,CACrD,CAAC,CACF,GA9BIG,EAAc,EAAG,CAACH,EAAW,QAAUG,EAAcN,EAAQ,OAAQM,MAArEA,CAAW,EAmCpB,OAAO,UAAA,CACLF,EAAUC,EAAY,IACxB,CACF,CAAC,EACDU,CACN,CC9DM,SAAUC,GAASC,EAAoD,CAC3E,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAW,GACXC,EAAsB,KACtBC,EAA6C,KAC7CC,EAAa,GAEXC,EAAc,UAAA,CAGlB,GAFAF,GAAkB,MAAlBA,EAAoB,YAAW,EAC/BA,EAAqB,KACjBF,EAAU,CACZA,EAAW,GACX,IAAMK,EAAQJ,EACdA,EAAY,KACZF,EAAW,KAAKM,CAAK,EAEvBF,GAAcJ,EAAW,SAAQ,CACnC,EAEMO,EAAkB,UAAA,CACtBJ,EAAqB,KACrBC,GAAcJ,EAAW,SAAQ,CACnC,EAEAD,EAAO,UACLS,EACER,EACA,SAACM,EAAK,CACJL,EAAW,GACXC,EAAYI,EACPH,GACHM,EAAUZ,EAAiBS,CAAK,CAAC,EAAE,UAChCH,EAAqBK,EAAyBR,EAAYK,EAAaE,CAAe,CAAE,CAG/F,EACA,UAAA,CACEH,EAAa,IACZ,CAACH,GAAY,CAACE,GAAsBA,EAAmB,SAAWH,EAAW,SAAQ,CACxF,CAAC,CACF,CAEL,CAAC,CACH,CC3CM,SAAUU,GAAaC,EAAkBC,EAAyC,CAAzC,OAAAA,IAAA,SAAAA,EAAAC,IACtCC,GAAM,UAAA,CAAM,OAAAC,GAAMJ,EAAUC,CAAS,CAAzB,CAA0B,CAC/C,CCEM,SAAUI,GAAeC,EAAoBC,EAAsC,CAAtC,OAAAA,IAAA,SAAAA,EAAA,MAGjDA,EAAmBA,GAAgB,KAAhBA,EAAoBD,EAEhCE,EAAQ,SAACC,EAA
QC,EAAU,CAChC,IAAIC,EAAiB,CAAA,EACjBC,EAAQ,EAEZH,EAAO,UACLI,EACEH,EACA,SAACI,EAAK,aACAC,EAAuB,KAKvBH,IAAUL,IAAsB,GAClCI,EAAQ,KAAK,CAAA,CAAE,MAIjB,QAAqBK,EAAAC,GAAAN,CAAO,EAAAO,EAAAF,EAAA,KAAA,EAAA,CAAAE,EAAA,KAAAA,EAAAF,EAAA,KAAA,EAAE,CAAzB,IAAMG,EAAMD,EAAA,MACfC,EAAO,KAAKL,CAAK,EAMbR,GAAca,EAAO,SACvBJ,EAASA,GAAM,KAANA,EAAU,CAAA,EACnBA,EAAO,KAAKI,CAAM,uGAItB,GAAIJ,MAIF,QAAqBK,EAAAH,GAAAF,CAAM,EAAAM,GAAAD,EAAA,KAAA,EAAA,CAAAC,GAAA,KAAAA,GAAAD,EAAA,KAAA,EAAE,CAAxB,IAAMD,EAAME,GAAA,MACfC,GAAUX,EAASQ,CAAM,EACzBT,EAAW,KAAKS,CAAM,wGAG5B,EACA,UAAA,aAGE,QAAqBI,EAAAN,GAAAN,CAAO,EAAAa,EAAAD,EAAA,KAAA,EAAA,CAAAC,EAAA,KAAAA,EAAAD,EAAA,KAAA,EAAE,CAAzB,IAAMJ,EAAMK,EAAA,MACfd,EAAW,KAAKS,CAAM,oGAExBT,EAAW,SAAQ,CACrB,EAEA,OACA,UAAA,CAEEC,EAAU,IACZ,CAAC,CACF,CAEL,CAAC,CACH,CCbM,SAAUc,GACdC,EAAgD,CAEhD,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAgC,KAChCC,EAAY,GACZC,EAEJF,EAAWF,EAAO,UAChBK,EAAyBJ,EAAY,OAAW,OAAW,SAACK,EAAG,CAC7DF,EAAgBG,EAAUT,EAASQ,EAAKT,GAAWC,CAAQ,EAAEE,CAAM,CAAC,CAAC,EACjEE,GACFA,EAAS,YAAW,EACpBA,EAAW,KACXE,EAAc,UAAUH,CAAU,GAIlCE,EAAY,EAEhB,CAAC,CAAC,EAGAA,IAMFD,EAAS,YAAW,EACpBA,EAAW,KACXE,EAAe,UAAUH,CAAU,EAEvC,CAAC,CACH,CC/HM,SAAUO,GACdC,EACAC,EACAC,EACAC,EACAC,EAAqC,CAErC,OAAO,SAACC,EAAuBC,EAA2B,CAIxD,IAAIC,EAAWL,EAIXM,EAAaP,EAEbQ,EAAQ,EAGZJ,EAAO,UACLK,EACEJ,EACA,SAACK,EAAK,CAEJ,IAAMC,EAAIH,IAEVD,EAAQD,EAEJP,EAAYQ,EAAOG,EAAOC,CAAC,GAIzBL,EAAW,GAAOI,GAGxBR,GAAcG,EAAW,KAAKE,CAAK,CACrC,EAGAJ,GACG,UAAA,CACCG,GAAYD,EAAW,KAAKE,CAAK,EACjCF,EAAW,SAAQ,CACrB,CAAE,CACL,CAEL,CACF,CCnCM,SAAUO,IAAa,SAAOC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAClC,IAAMC,EAAiBC,GAAkBH,CAAI,EAC7C,OAAOE,EACHE,GAAKL,GAAa,MAAA,OAAAM,EAAA,CAAA,EAAAC,EAAKN,CAAoC,CAAA,CAAA,EAAGO,GAAiBL,CAAc,CAAC,EAC9FM,EAAQ,SAACC,EAAQC,EAAU,CACzBC,GAAiBN,EAAA,CAAEI,CAAM,EAAAH,EAAKM,GAAeZ,CAAI,CAAC,CAAA,CAAA,EAAGU,CAAU,CACjE,CAAC,CACP,CCUM,SAAUG,IAAiB,SAC/BC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAEA,OAAOC,GAAa,MAAA,OAAAC,EAAA,CAAA,EAAAC,EAAIJ,CAA
Y,CAAA,CAAA,CACtC,CCkBM,SAAUK,GAAYC,EAAoD,CAC9E,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAW,GACXC,EAAsB,KAEtBC,EAA6C,KAE3CC,EAAO,UAAA,CAMX,GAFAD,GAAkB,MAAlBA,EAAoB,YAAW,EAC/BA,EAAqB,KACjBF,EAAU,CAEZA,EAAW,GACX,IAAMI,EAAQH,EACdA,EAAY,KACZF,EAAW,KAAKK,CAAK,EAEzB,EAEAN,EAAO,UACLO,EACEN,EACA,SAACK,EAAQ,CAIPF,GAAkB,MAAlBA,EAAoB,YAAW,EAC/BF,EAAW,GACXC,EAAYG,EAGZF,EAAqBG,EAAyBN,EAAYI,EAAMG,EAAI,EAEpEC,EAAUX,EAAiBQ,CAAK,CAAC,EAAE,UAAUF,CAAkB,CACjE,EACA,UAAA,CAGEC,EAAI,EACJJ,EAAW,SAAQ,CACrB,EAEA,OACA,UAAA,CAEEE,EAAYC,EAAqB,IACnC,CAAC,CACF,CAEL,CAAC,CACH,CCvDM,SAAUM,GAAgBC,EAAiBC,EAAyC,CAAzC,OAAAA,IAAA,SAAAA,EAAAC,IACxCC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAkC,KAClCC,EAAsB,KACtBC,EAA0B,KAExBC,EAAO,UAAA,CACX,GAAIH,EAAY,CAEdA,EAAW,YAAW,EACtBA,EAAa,KACb,IAAMI,EAAQH,EACdA,EAAY,KACZF,EAAW,KAAKK,CAAK,EAEzB,EACA,SAASC,GAAY,CAInB,IAAMC,EAAaJ,EAAYR,EACzBa,EAAMZ,EAAU,IAAG,EACzB,GAAIY,EAAMD,EAAY,CAEpBN,EAAa,KAAK,SAAS,OAAWM,EAAaC,CAAG,EACtDR,EAAW,IAAIC,CAAU,EACzB,OAGFG,EAAI,CACN,CAEAL,EAAO,UACLU,EACET,EACA,SAACK,EAAQ,CACPH,EAAYG,EACZF,EAAWP,EAAU,IAAG,EAGnBK,IACHA,EAAaL,EAAU,SAASU,EAAcX,CAAO,EACrDK,EAAW,IAAIC,CAAU,EAE7B,EACA,UAAA,CAGEG,EAAI,EACJJ,EAAW,SAAQ,CACrB,EAEA,OACA,UAAA,CAEEE,EAAYD,EAAa,IAC3B,CAAC,CACF,CAEL,CAAC,CACH,CCpFM,SAAUS,GAAqBC,EAAe,CAClD,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAW,GACfF,EAAO,UACLG,EACEF,EACA,SAACG,EAAK,CACJF,EAAW,GACXD,EAAW,KAAKG,CAAK,CACvB,EACA,UAAA,CACOF,GACHD,EAAW,KAAKH,CAAa,EAE/BG,EAAW,SAAQ,CACrB,CAAC,CACF,CAEL,CAAC,CACH,CCXM,SAAUI,GAAQC,EAAa,CACnC,OAAOA,GAAS,EAEZ,UAAA,CAAM,OAAAC,CAAA,EACNC,EAAQ,SAACC,EAAQC,EAAU,CACzB,IAAIC,EAAO,EACXF,EAAO,UACLG,EAAyBF,EAAY,SAACG,EAAK,CAIrC,EAAEF,GAAQL,IACZI,EAAW,KAAKG,CAAK,EAIjBP,GAASK,GACXD,EAAW,SAAQ,EAGzB,CAAC,CAAC,CAEN,CAAC,CACP,CC9BM,SAAUI,GAAc,CAC5B,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChCD,EAAO,UAAUE,EAAyBD,EAAYE,EAAI,CAAC,CAC7D,CAAC,CACH,CCCM,SAAUC,GAASC,EAAQ,CAC/B,OAAOC,EAAI,UAAA,CAAM,OAAAD,CAAA,CAAK,CACxB,CC4CM,SAAUE,GACdC,EACAC,EAAmC,CAEnC,OAAIA,EAEK,SAACC,EAAqB,CAC3B,OAAAC,GAAOF,EAAkB,KAAKG,GAAK,CAAC,EAAGC,EAAc,
CAAE,EAAGH,EAAO,KAAKH,GAAUC,CAAqB,CAAC,CAAC,CAAvG,EAGGM,GAAS,SAACC,EAAOC,EAAK,CAAK,OAAAC,EAAUT,EAAsBO,EAAOC,CAAK,CAAC,EAAE,KAAKJ,GAAK,CAAC,EAAGM,GAAMH,CAAK,CAAC,CAAzE,CAA0E,CAC9G,CCzCM,SAAUI,GAASC,EAAoBC,EAAyC,CAAzCA,IAAA,SAAAA,EAAAC,IAC3C,IAAMC,EAAWC,GAAMJ,EAAKC,CAAS,EACrC,OAAOI,GAAU,UAAA,CAAM,OAAAF,CAAA,CAAQ,CACjC,CC0EM,SAAUG,EACdC,EACAC,EAA0D,CAA1D,OAAAA,IAAA,SAAAA,EAA+BC,IAK/BF,EAAaA,GAAU,KAAVA,EAAcG,GAEpBC,EAAQ,SAACC,EAAQC,EAAU,CAGhC,IAAIC,EAEAC,EAAQ,GAEZH,EAAO,UACLI,EAAyBH,EAAY,SAACI,EAAK,CAEzC,IAAMC,EAAaV,EAAYS,CAAK,GAKhCF,GAAS,CAACR,EAAYO,EAAaI,CAAU,KAM/CH,EAAQ,GACRD,EAAcI,EAGdL,EAAW,KAAKI,CAAK,EAEzB,CAAC,CAAC,CAEN,CAAC,CACH,CAEA,SAASP,GAAeS,EAAQC,EAAM,CACpC,OAAOD,IAAMC,CACf,CCjHM,SAAUC,EAA8CC,EAAQC,EAAuC,CAC3G,OAAOC,EAAqB,SAACC,EAAMC,EAAI,CAAK,OAAAH,EAAUA,EAAQE,EAAEH,CAAG,EAAGI,EAAEJ,CAAG,CAAC,EAAIG,EAAEH,CAAG,IAAMI,EAAEJ,CAAG,CAApD,CAAqD,CACnG,CC7BM,SAAUK,GAAgBC,EAA6C,CAA7C,OAAAA,IAAA,SAAAA,EAAAC,IACvBC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAW,GACfF,EAAO,UACLG,EACEF,EACA,SAACG,EAAK,CACJF,EAAW,GACXD,EAAW,KAAKG,CAAK,CACvB,EACA,UAAA,CAAM,OAACF,EAAWD,EAAW,SAAQ,EAAKA,EAAW,MAAMJ,EAAY,CAAE,CAAnE,CAAqE,CAC5E,CAEL,CAAC,CACH,CAEA,SAASC,IAAmB,CAC1B,OAAO,IAAIO,EACb,CCMM,SAAUC,IAAO,SAAIC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACzB,OAAO,SAACC,EAAqB,CAAK,OAAAC,GAAOD,EAAQE,EAAE,MAAA,OAAAC,EAAA,CAAA,EAAAC,EAAIN,CAAM,CAAA,CAAA,CAAA,CAA3B,CACpC,CCHM,SAAUO,EAAYC,EAAoB,CAC9C,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAGhC,GAAI,CACFD,EAAO,UAAUC,CAAU,UAE3BA,EAAW,IAAIH,CAAQ,EAE3B,CAAC,CACH,CCMM,SAAUI,GACdC,EACAC,EAAgB,CAEhB,IAAMC,EAAkB,UAAU,QAAU,EAC5C,OAAO,SAACC,EAAqB,CAC3B,OAAAA,EAAO,KACLH,EAAYI,EAAO,SAACC,EAAG,EAAC,CAAK,OAAAL,EAAUK,EAAG,EAAGF,CAAM,CAAtB,CAAuB,EAAIG,GACxDC,GAAK,CAAC,EACNL,EAAkBM,GAAeP,CAAa,EAAIQ,GAAa,UAAA,CAAM,OAAA,IAAIC,EAAJ,CAAgB,CAAC,CAHxF,CAKJ,CC/CM,SAAUC,GAAYC,EAAa,CACvC,OAAOA,GAAS,EACZ,UAAA,CAAM,OAAAC,CAAA,EACNC,EAAQ,SAACC,EAAQC,EAAU,CAKzB,IAAIC,EAAc,CAAA,EAClBF,EAAO,UACLG,EACEF,EACA,SAACG,EAAK,CAEJF,EAAO,KAAKE,CAAK,EAGjBP,EAAQK,EAAO,
QAAUA,EAAO,MAAK,CACvC,EACA,UAAA,aAGE,QAAoBG,EAAAC,GAAAJ,CAAM,EAAAK,EAAAF,EAAA,KAAA,EAAA,CAAAE,EAAA,KAAAA,EAAAF,EAAA,KAAA,EAAE,CAAvB,IAAMD,EAAKG,EAAA,MACdN,EAAW,KAAKG,CAAK,oGAEvBH,EAAW,SAAQ,CACrB,EAEA,OACA,UAAA,CAEEC,EAAS,IACX,CAAC,CACF,CAEL,CAAC,CACP,CC1DM,SAAUM,IAAK,SAAIC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACvB,IAAMC,EAAYC,GAAaH,CAAI,EAC7BI,EAAaC,GAAUL,EAAM,GAAQ,EAC3C,OAAAA,EAAOM,GAAeN,CAAI,EAEnBO,EAAQ,SAACC,EAAQC,EAAU,CAChCC,GAASN,CAAU,EAAEO,GAAIC,EAAA,CAAEJ,CAAM,EAAAK,EAAMb,CAA6B,CAAA,EAAGE,CAAS,CAAC,EAAE,UAAUO,CAAU,CACzG,CAAC,CACH,CCcM,SAAUK,IAAS,SACvBC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAEA,OAAOC,GAAK,MAAA,OAAAC,EAAA,CAAA,EAAAC,EAAIJ,CAAY,CAAA,CAAA,CAC9B,CCmEM,SAAUK,GAAUC,EAAqC,OACzDC,EAAQ,IACRC,EAEJ,OAAIF,GAAiB,OACf,OAAOA,GAAkB,UACxBG,EAA4BH,EAAa,MAAzCC,EAAKE,IAAA,OAAG,IAAQA,EAAED,EAAUF,EAAa,OAE5CC,EAAQD,GAILC,GAAS,EACZ,UAAA,CAAM,OAAAG,CAAA,EACNC,EAAQ,SAACC,EAAQC,EAAU,CACzB,IAAIC,EAAQ,EACRC,EAEEC,EAAc,UAAA,CAGlB,GAFAD,GAAS,MAATA,EAAW,YAAW,EACtBA,EAAY,KACRP,GAAS,KAAM,CACjB,IAAMS,EAAW,OAAOT,GAAU,SAAWU,GAAMV,CAAK,EAAIW,EAAUX,EAAMM,CAAK,CAAC,EAC5EM,EAAqBC,EAAyBR,EAAY,UAAA,CAC9DO,EAAmB,YAAW,EAC9BE,EAAiB,CACnB,CAAC,EACDL,EAAS,UAAUG,CAAkB,OAErCE,EAAiB,CAErB,EAEMA,EAAoB,UAAA,CACxB,IAAIC,EAAY,GAChBR,EAAYH,EAAO,UACjBS,EAAyBR,EAAY,OAAW,UAAA,CAC1C,EAAEC,EAAQP,EACRQ,EACFC,EAAW,EAEXO,EAAY,GAGdV,EAAW,SAAQ,CAEvB,CAAC,CAAC,EAGAU,GACFP,EAAW,CAEf,EAEAM,EAAiB,CACnB,CAAC,CACP,CCpFM,SAAUE,GAAcC,EAA6DC,EAAQ,CAMjG,OAAOC,EAAQC,GAAcH,EAAaC,EAAW,UAAU,QAAU,EAAG,EAAI,CAAC,CACnF,CC+CM,SAAUG,GAASC,EAA4B,CAA5BA,IAAA,SAAAA,EAAA,CAAA,GACf,IAAAC,EAAgHD,EAAO,UAAvHE,EAASD,IAAA,OAAG,UAAA,CAAM,OAAA,IAAIE,CAAJ,EAAgBF,EAAEG,EAA4EJ,EAAO,aAAnFK,EAAYD,IAAA,OAAG,GAAIA,EAAEE,EAAuDN,EAAO,gBAA9DO,EAAeD,IAAA,OAAG,GAAIA,EAAEE,EAA+BR,EAAO,oBAAtCS,EAAmBD,IAAA,OAAG,GAAIA,EAUnH,OAAO,SAACE,EAAa,CACnB,IAAIC,EACAC,EACAC,EACAC,EAAW,EACXC,EAAe,GACfC,EAAa,GAEXC,GAAc,UAAA,CAClBL,GAAe,MAAfA,EAAiB,YAAW,EAC5BA,EAAkB,MACpB,EAGMM,GAAQ,UAAA,CA
CZD,GAAW,EACXN,EAAaE,EAAU,OACvBE,EAAeC,EAAa,EAC9B,EACMG,EAAsB,UAAA,CAG1B,IAAMC,EAAOT,EACbO,GAAK,EACLE,GAAI,MAAJA,EAAM,YAAW,CACnB,EAEA,OAAOC,EAAc,SAACC,EAAQC,GAAU,CACtCT,IACI,CAACE,GAAc,CAACD,GAClBE,GAAW,EAOb,IAAMO,GAAQX,EAAUA,GAAO,KAAPA,EAAWX,EAAS,EAO5CqB,GAAW,IAAI,UAAA,CACbT,IAKIA,IAAa,GAAK,CAACE,GAAc,CAACD,IACpCH,EAAkBa,GAAYN,EAAqBV,CAAmB,EAE1E,CAAC,EAIDe,GAAK,UAAUD,EAAU,EAGvB,CAACZ,GAIDG,EAAW,IAOXH,EAAa,IAAIe,GAAe,CAC9B,KAAM,SAACC,GAAK,CAAK,OAAAH,GAAK,KAAKG,EAAK,CAAf,EACjB,MAAO,SAACC,GAAG,CACTZ,EAAa,GACbC,GAAW,EACXL,EAAkBa,GAAYP,GAAOb,EAAcuB,EAAG,EACtDJ,GAAK,MAAMI,EAAG,CAChB,EACA,SAAU,UAAA,CACRb,EAAe,GACfE,GAAW,EACXL,EAAkBa,GAAYP,GAAOX,CAAe,EACpDiB,GAAK,SAAQ,CACf,EACD,EACDK,EAAUP,CAAM,EAAE,UAAUX,CAAU,EAE1C,CAAC,EAAED,CAAa,CAClB,CACF,CAEA,SAASe,GACPP,EACAY,EAAoD,SACpDC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,EAAA,CAAA,EAAA,UAAAA,CAAA,EAEA,GAAIF,IAAO,GAAM,CACfZ,EAAK,EACL,OAGF,GAAIY,IAAO,GAIX,KAAMG,EAAe,IAAIP,GAAe,CACtC,KAAM,UAAA,CACJO,EAAa,YAAW,EACxBf,EAAK,CACP,EACD,EAED,OAAOW,EAAUC,EAAE,MAAA,OAAAI,EAAA,CAAA,EAAAC,EAAIJ,CAAI,CAAA,CAAA,CAAA,EAAG,UAAUE,CAAY,EACtD,CChHM,SAAUG,EACdC,EACAC,EACAC,EAAyB,WAErBC,EACAC,EAAW,GACf,OAAIJ,GAAsB,OAAOA,GAAuB,UACnDK,EAA8EL,EAAkB,WAAhGG,EAAUE,IAAA,OAAG,IAAQA,EAAEC,EAAuDN,EAAkB,WAAzEC,EAAUK,IAAA,OAAG,IAAQA,EAAEC,EAAgCP,EAAkB,SAAlDI,EAAQG,IAAA,OAAG,GAAKA,EAAEL,EAAcF,EAAkB,WAEnGG,EAAcH,GAAkB,KAAlBA,EAAsB,IAE/BQ,GAAS,CACd,UAAW,UAAA,CAAM,OAAA,IAAIC,GAAcN,EAAYF,EAAYC,CAAS,CAAnD,EACjB,aAAc,GACd,gBAAiB,GACjB,oBAAqBE,EACtB,CACH,CCxIM,SAAUM,GAAQC,EAAa,CACnC,OAAOC,EAAO,SAACC,EAAGC,EAAK,CAAK,OAAAH,GAASG,CAAT,CAAc,CAC5C,CCaM,SAAUC,GAAaC,EAA8B,CACzD,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAS,GAEPC,EAAiBC,EACrBH,EACA,UAAA,CACEE,GAAc,MAAdA,EAAgB,YAAW,EAC3BD,EAAS,EACX,EACAG,EAAI,EAGNC,EAAUR,CAAQ,EAAE,UAAUK,CAAc,EAE5CH,EAAO,UAAUI,EAAyBH,EAAY,SAACM,EAAK,CAAK,OAAAL,GAAUD,EAAW,KAAKM,CAAK,CAA/B,CAAgC,CAAC,CACpG,CAAC,CACH,CCVM,SAAUC,GAAS,SAAOC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EAC9B,IAAMC,EAAYC,GA
AaH,CAAM,EACrC,OAAOI,EAAQ,SAACC,EAAQC,EAAU,EAI/BJ,EAAYK,GAAOP,EAAQK,EAAQH,CAAS,EAAIK,GAAOP,EAAQK,CAAM,GAAG,UAAUC,CAAU,CAC/F,CAAC,CACH,CCmBM,SAAUE,EACdC,EACAC,EAA6G,CAE7G,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAyD,KACzDC,EAAQ,EAERC,EAAa,GAIXC,EAAgB,UAAA,CAAM,OAAAD,GAAc,CAACF,GAAmBD,EAAW,SAAQ,CAArD,EAE5BD,EAAO,UACLM,EACEL,EACA,SAACM,EAAK,CAEJL,GAAe,MAAfA,EAAiB,YAAW,EAC5B,IAAIM,EAAa,EACXC,EAAaN,IAEnBO,EAAUb,EAAQU,EAAOE,CAAU,CAAC,EAAE,UACnCP,EAAkBI,EACjBL,EAIA,SAACU,EAAU,CAAK,OAAAV,EAAW,KAAKH,EAAiBA,EAAeS,EAAOI,EAAYF,EAAYD,GAAY,EAAIG,CAAU,CAAzG,EAChB,UAAA,CAIET,EAAkB,KAClBG,EAAa,CACf,CAAC,CACD,CAEN,EACA,UAAA,CACED,EAAa,GACbC,EAAa,CACf,CAAC,CACF,CAEL,CAAC,CACH,CCvFM,SAAUO,EAAaC,EAA8B,CACzD,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChCC,EAAUJ,CAAQ,EAAE,UAAUK,EAAyBF,EAAY,UAAA,CAAM,OAAAA,EAAW,SAAQ,CAAnB,EAAuBG,EAAI,CAAC,EACrG,CAACH,EAAW,QAAUD,EAAO,UAAUC,CAAU,CACnD,CAAC,CACH,CCIM,SAAUI,GAAaC,EAAiDC,EAAiB,CAAjB,OAAAA,IAAA,SAAAA,EAAA,IACrEC,EAAQ,SAACC,EAAQC,EAAU,CAChC,IAAIC,EAAQ,EACZF,EAAO,UACLG,EAAyBF,EAAY,SAACG,EAAK,CACzC,IAAMC,EAASR,EAAUO,EAAOF,GAAO,GACtCG,GAAUP,IAAcG,EAAW,KAAKG,CAAK,EAC9C,CAACC,GAAUJ,EAAW,SAAQ,CAChC,CAAC,CAAC,CAEN,CAAC,CACH,CCqGM,SAAUK,EACdC,EACAC,EACAC,EAA8B,CAK9B,IAAMC,EACJC,EAAWJ,CAAc,GAAKC,GAASC,EAElC,CAAE,KAAMF,EAA2E,MAAKC,EAAE,SAAQC,CAAA,EACnGF,EAEN,OAAOG,EACHE,EAAQ,SAACC,EAAQC,EAAU,QACzBC,EAAAL,EAAY,aAAS,MAAAK,IAAA,QAAAA,EAAA,KAArBL,CAAW,EACX,IAAIM,EAAU,GACdH,EAAO,UACLI,EACEH,EACA,SAACI,EAAK,QACJH,EAAAL,EAAY,QAAI,MAAAK,IAAA,QAAAA,EAAA,KAAhBL,EAAmBQ,CAAK,EACxBJ,EAAW,KAAKI,CAAK,CACvB,EACA,UAAA,OACEF,EAAU,IACVD,EAAAL,EAAY,YAAQ,MAAAK,IAAA,QAAAA,EAAA,KAApBL,CAAW,EACXI,EAAW,SAAQ,CACrB,EACA,SAACK,EAAG,OACFH,EAAU,IACVD,EAAAL,EAAY,SAAK,MAAAK,IAAA,QAAAA,EAAA,KAAjBL,EAAoBS,CAAG,EACvBL,EAAW,MAAMK,CAAG,CACtB,EACA,UAAA,SACMH,KACFD,EAAAL,EAAY,eAAW,MAAAK,IAAA,QAAAA,EAAA,KAAvBL,CAAW,IAEbU,EAAAV,EAAY,YAAQ,MAAAU,IAAA,QAAAA,EAAA,KAApBV,CAAW,CACb,CAAC,CACF,CAEL,CAAC,EAIDW,EACN,CCnIM,SAAUC,GAAYC,EAAsDC,EAAuB,CACvG,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAC1B,IAAAC,EAAuCJ,GAAM,KAANA,EAAU,CAA
A,EAA/CK,EAAAD,EAAA,QAAAE,EAAOD,IAAA,OAAG,GAAIA,EAAEE,EAAAH,EAAA,SAAAI,EAAQD,IAAA,OAAG,GAAKA,EACpCE,EAAW,GACXC,EAAsB,KACtBC,EAAiC,KACjCC,EAAa,GAEXC,EAAgB,UAAA,CACpBF,GAAS,MAATA,EAAW,YAAW,EACtBA,EAAY,KACRH,IACFM,GAAI,EACJF,GAAcT,EAAW,SAAQ,EAErC,EAEMY,EAAoB,UAAA,CACxBJ,EAAY,KACZC,GAAcT,EAAW,SAAQ,CACnC,EAEMa,EAAgB,SAACC,GAAQ,CAC7B,OAACN,EAAYO,EAAUnB,EAAiBkB,EAAK,CAAC,EAAE,UAAUE,EAAyBhB,EAAYU,EAAeE,CAAiB,CAAC,CAAhI,EAEID,GAAO,UAAA,CACX,GAAIL,EAAU,CAIZA,EAAW,GACX,IAAMQ,GAAQP,EACdA,EAAY,KAEZP,EAAW,KAAKc,EAAK,EACrB,CAACL,GAAcI,EAAcC,EAAK,EAEtC,EAEAf,EAAO,UACLiB,EACEhB,EAMA,SAACc,GAAK,CACJR,EAAW,GACXC,EAAYO,GACZ,EAAEN,GAAa,CAACA,EAAU,UAAYL,EAAUQ,GAAI,EAAKE,EAAcC,EAAK,EAC9E,EACA,UAAA,CACEL,EAAa,GACb,EAAEJ,GAAYC,GAAYE,GAAa,CAACA,EAAU,SAAWR,EAAW,SAAQ,CAClF,CAAC,CACF,CAEL,CAAC,CACH,CCxFM,SAAUiB,GACdC,EACAC,EACAC,EAAuB,CADvBD,IAAA,SAAAA,EAAAE,IAGA,IAAMC,EAAYC,GAAML,EAAUC,CAAS,EAC3C,OAAOK,GAAS,UAAA,CAAM,OAAAF,CAAA,EAAWF,CAAM,CACzC,CCJM,SAAUK,IAAc,SAAOC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACnC,IAAMC,EAAUC,GAAkBH,CAAM,EAExC,OAAOI,EAAQ,SAACC,EAAQC,EAAU,CAehC,QAdMC,EAAMP,EAAO,OACbQ,EAAc,IAAI,MAAMD,CAAG,EAI7BE,EAAWT,EAAO,IAAI,UAAA,CAAM,MAAA,EAAA,CAAK,EAGjCU,EAAQ,cAMHC,EAAC,CACRC,EAAUZ,EAAOW,CAAC,CAAC,EAAE,UACnBE,EACEP,EACA,SAACQ,EAAK,CACJN,EAAYG,CAAC,EAAIG,EACb,CAACJ,GAAS,CAACD,EAASE,CAAC,IAEvBF,EAASE,CAAC,EAAI,IAKbD,EAAQD,EAAS,MAAMM,EAAQ,KAAON,EAAW,MAEtD,EAGAO,EAAI,CACL,GAnBIL,EAAI,EAAGA,EAAIJ,EAAKI,MAAhBA,CAAC,EAwBVN,EAAO,UACLQ,EAAyBP,EAAY,SAACQ,EAAK,CACzC,GAAIJ,EAAO,CAET,IAAMO,EAAMC,EAAA,CAAIJ,CAAK,EAAAK,EAAKX,CAAW,CAAA,EACrCF,EAAW,KAAKJ,EAAUA,EAAO,MAAA,OAAAgB,EAAA,CAAA,EAAAC,EAAIF,CAAM,CAAA,CAAA,EAAIA,CAAM,EAEzD,CAAC,CAAC,CAEN,CAAC,CACH,CCxFM,SAAUG,IAAG,SAAOC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAAC,CAAA,EAAA,UAAAA,CAAA,EACxB,OAAOC,EAAQ,SAACC,EAAQC,EAAU,CAChCL,GAAS,MAAA,OAAAM,EAAA,CAACF,CAA8B,EAAAG,EAAMN,CAAuC,CAAA,CAAA,EAAE,UAAUI,CAAU,CAC7G,CAAC,CACH,CCCM,SAAUG,IAAO,SAAkCC,EAAA,CAAA,EAAAC,EAAA,EAAAA,EAAA,UAAA,OAAAA,IAAAD,EAAA
C,CAAA,EAAA,UAAAA,CAAA,EACvD,OAAOC,GAAG,MAAA,OAAAC,EAAA,CAAA,EAAAC,EAAIJ,CAAW,CAAA,CAAA,CAC3B,CCYO,SAASK,IAAmC,CACjD,IAAMC,EAAY,IAAIC,GAAwB,CAAC,EAC/C,OAAAC,EAAU,SAAU,mBAAoB,CAAE,KAAM,EAAK,CAAC,EACnD,UAAU,IAAMF,EAAU,KAAK,QAAQ,CAAC,EAGpCA,CACT,CCHO,SAASG,EACdC,EAAkBC,EAAmB,SAChC,CACL,OAAO,MAAM,KAAKA,EAAK,iBAAoBD,CAAQ,CAAC,CACtD,CAuBO,SAASE,EACdF,EAAkBC,EAAmB,SAClC,CACH,IAAME,EAAKC,GAAsBJ,EAAUC,CAAI,EAC/C,GAAI,OAAOE,GAAO,YAChB,MAAM,IAAI,eACR,8BAA8BH,CAAQ,iBACxC,EAGF,OAAOG,CACT,CAsBO,SAASC,GACdJ,EAAkBC,EAAmB,SACtB,CACf,OAAOA,EAAK,cAAiBD,CAAQ,GAAK,MAC5C,CAOO,SAASK,IAA4C,CAnH5D,IAAAC,EAAAC,EAAAC,EAAAC,EAoHE,OACEA,GAAAD,GAAAD,GAAAD,EAAA,SAAS,gBAAT,YAAAA,EAAwB,aAAxB,YAAAC,EAAoC,gBAApC,KAAAC,EACA,SAAS,gBADT,KAAAC,EAEA,MAEJ,CCvEA,IAAMC,GAAYC,EAChBC,EAAU,SAAS,KAAM,SAAS,EAClCA,EAAU,SAAS,KAAM,UAAU,CACrC,EACG,KACCC,GAAa,CAAC,EACdC,EAAU,MAAS,EACnBC,EAAI,IAAMC,GAAiB,GAAK,SAAS,IAAI,EAC7CC,EAAY,CAAC,CACf,EAaK,SAASC,GACdC,EACqB,CACrB,OAAOT,GACJ,KACCK,EAAIK,GAAUD,EAAG,SAASC,CAAM,CAAC,EACjCC,EAAqB,CACvB,CACJ,CC7BO,SAASC,GACdC,EAAiBC,EACI,CACrB,OAAOC,EAAM,IAAMC,EACjBC,EAAUJ,EAAI,YAAY,EAAE,KAAKK,EAAI,IAAM,EAAI,CAAC,EAChDD,EAAUJ,EAAI,YAAY,EAAE,KAAKK,EAAI,IAAM,EAAK,CAAC,CACnD,EACG,KACCJ,EAAUK,GAASC,GAAUC,GAAM,CAAC,CAACD,EAASN,CAAO,CAAC,EAAIQ,GAC1DC,EAAUV,EAAG,QAAQ,QAAQ,CAAC,CAChC,CACF,CACF,CCPA,SAASW,GAAYC,EAAiBC,EAA8B,CAGlE,GAAI,OAAOA,GAAU,UAAY,OAAOA,GAAU,SAChDD,EAAG,WAAaC,EAAM,SAAS,UAGtBA,aAAiB,KAC1BD,EAAG,YAAYC,CAAK,UAGX,MAAM,QAAQA,CAAK,EAC5B,QAAWC,KAAQD,EACjBF,GAAYC,EAAIE,CAAI,CAE1B,CAyBO,SAASC,EACdC,EAAaC,KAAmCC,EAC7C,CACH,IAAMN,EAAK,SAAS,cAAcI,CAAG,EAGrC,GAAIC,EACF,QAAWE,KAAQ,OAAO,KAAKF,CAAU,EACnC,OAAOA,EAAWE,CAAI,GAAM,cAI5B,OAAOF,EAAWE,CAAI,GAAM,UAC9BP,EAAG,aAAaO,EAAMF,EAAWE,CAAI,CAAC,EAEtCP,EAAG,aAAaO,EAAM,EAAE,GAI9B,QAAWN,KAASK,EAClBP,GAAYC,EAAIC,CAAK,EAGvB,OAAOD,CACT,CC9EO,SAASQ,GAAMC,EAAuB,CAC3C,GAAIA,EAAQ,IAAK,CACf,IAAMC,EAAS,GAAGD,EAAQ,KAAO,IAAO,IACxC,MAAO,KAAKA,EAAQ,MAAY,KAAM,QAAQC,CAAM,CAAC,GACvD,KACE,QAAOD,EAAM,SAAS,CAE1B,CCCO,SAASE,GAAYC,EAA+B,CACzD,IAAMC,EAASC,EAAE,SAAU,C
AAE,IAAAF,CAAI,CAAC,EAClC,OAAOG,EAAM,KACX,SAAS,KAAK,YAAYF,CAAM,EACzBG,EACLC,EAAUJ,EAAQ,MAAM,EACxBI,EAAUJ,EAAQ,OAAO,EACtB,KACCK,EAAU,IACRC,GAAW,IAAM,IAAI,eAAe,mBAAmBP,CAAG,EAAE,CAAC,CAC9D,CACH,CACJ,EACG,KACCQ,EAAI,IAAG,EAAY,EACnBC,EAAS,IAAM,SAAS,KAAK,YAAYR,CAAM,CAAC,EAChDS,GAAK,CAAC,CACR,EACH,CACH,CCVA,IAAMC,GAAS,IAAIC,EAiBbC,GAAYC,EAAM,IACtB,OAAO,gBAAmB,YACtBC,GAAY,4CAA4C,EACxDC,EAAG,MAAS,CACjB,EACE,KACCC,EAAI,IAAM,IAAI,eAAeC,GAC3BA,EAAQ,QAAQC,GAASR,GAAO,KAAKQ,CAAK,CAAC,CAC5C,CAAC,EACFC,EAAUC,GAAYC,EAAMC,GAAOP,EAAGK,CAAQ,CAAC,EAAE,KAC/CG,EAAS,IAAMH,EAAS,WAAW,CAAC,CACtC,CAAC,EACDI,EAAY,CAAC,CACf,EAaK,SAASC,GACdC,EACa,CACb,MAAO,CACL,MAAQA,EAAG,YACX,OAAQA,EAAG,YACb,CACF,CAuBO,SAASC,GACdD,EACyB,CAMzB,IAAIE,EAASF,EACb,KAAOE,EAAO,cAAgB,GACxBA,EAAO,eACTA,EAASA,EAAO,cAMpB,OAAOhB,GAAU,KACfiB,EAAIT,GAAYA,EAAS,QAAQQ,CAAM,CAAC,EACxCT,EAAUC,GAAYV,GAAO,KAC3BoB,EAAOZ,GAASA,EAAM,SAAWU,CAAM,EACvCL,EAAS,IAAMH,EAAS,UAAUQ,CAAM,CAAC,CAC3C,CAAC,EACDZ,EAAI,IAAMS,GAAeC,CAAE,CAAC,EAC5BK,EAAUN,GAAeC,CAAE,CAAC,CAC9B,CACF,CC3HO,SAASM,GACdC,EACa,CACb,MAAO,CACL,MAAQA,EAAG,YACX,OAAQA,EAAG,YACb,CACF,CASO,SAASC,GACdD,EACyB,CACzB,IAAIE,EAASF,EAAG,cAChB,KAAOE,IAEHF,EAAG,aAAgBE,EAAO,aAC1BF,EAAG,cAAgBE,EAAO,eAE1BA,GAAUF,EAAKE,GAAQ,cAK3B,OAAOA,EAASF,EAAK,MACvB,CAYO,SAASG,GACdH,EACe,CACf,IAAMI,EAA4B,CAAC,EAG/BF,EAASF,EAAG,cAChB,KAAOE,IAEHF,EAAG,YAAeE,EAAO,aACzBF,EAAG,aAAeE,EAAO,eAEzBE,EAAW,KAAKF,CAAM,EAGxBA,GAAUF,EAAKE,GAAQ,cAKzB,OAAIE,EAAW,SAAW,GACxBA,EAAW,KAAK,SAAS,eAAe,EAGnCA,CACT,CC9CO,SAASC,GACdC,EACe,CACf,MAAO,CACL,EAAGA,EAAG,WACN,EAAGA,EAAG,SACR,CACF,CASO,SAASC,GACdD,EACe,CACf,IAAME,EAAOF,EAAG,sBAAsB,EACtC,MAAO,CACL,EAAGE,EAAK,EAAI,OAAO,QACnB,EAAGA,EAAK,EAAI,OAAO,OACrB,CACF,CAWO,SAASC,GACdH,EAC2B,CAC3B,OAAOI,EACLC,EAAU,OAAQ,MAAM,EACxBA,EAAU,OAAQ,QAAQ,CAC5B,EACG,KACCC,GAAU,EAAGC,EAAuB,EACpCC,EAAI,IAAMT,GAAiBC,CAAE,CAAC,EAC9BS,EAAUV,GAAiBC,CAAE,CAAC,CAChC,CACJ,CC3DO,SAASU,GACdC,EACe,CACf,MAAO,CACL,EAAGA,EAAG,WACN,EAAGA,EAAG,SACR,CACF,CAWO,SAASC,GACdD,EAC2B,CAC3B,OAAOE,EACLC,EAAUH,EAAI,QAAQ,EACtBG,EA
AU,OAAQ,QAAQ,EAC1BA,EAAU,OAAQ,QAAQ,CAC5B,EACG,KACCC,GAAU,EAAGC,EAAuB,EACpCC,EAAI,IAAMP,GAAwBC,CAAE,CAAC,EACrCO,EAAUR,GAAwBC,CAAE,CAAC,CACvC,CACJ,CCzBA,IAAMQ,GAAS,IAAIC,EAUbC,GAAYC,EAAM,IAAMC,EAC5B,IAAI,qBAAqBC,GAAW,CAClC,QAAWC,KAASD,EAClBL,GAAO,KAAKM,CAAK,CACrB,EAAG,CACD,UAAW,CACb,CAAC,CACH,CAAC,EACE,KACCC,EAAUC,GAAYC,EAAMC,GAAON,EAAGI,CAAQ,CAAC,EAC5C,KACCG,EAAS,IAAMH,EAAS,WAAW,CAAC,CACtC,CACF,EACAI,EAAY,CAAC,CACf,EAaK,SAASC,GACdC,EACqB,CACrB,OAAOZ,GACJ,KACCa,EAAIP,GAAYA,EAAS,QAAQM,CAAE,CAAC,EACpCP,EAAUC,GAAYR,GACnB,KACCgB,EAAO,CAAC,CAAE,OAAAC,CAAO,IAAMA,IAAWH,CAAE,EACpCH,EAAS,IAAMH,EAAS,UAAUM,CAAE,CAAC,EACrCI,EAAI,CAAC,CAAE,eAAAC,CAAe,IAAMA,CAAc,CAC5C,CACF,CACF,CACJ,CAaO,SAASC,GACdN,EAAiBO,EAAY,GACR,CACrB,OAAOC,GAA0BR,CAAE,EAChC,KACCI,EAAI,CAAC,CAAE,EAAAK,CAAE,IAAM,CACb,IAAMC,EAAUC,GAAeX,CAAE,EAC3BY,EAAUC,GAAsBb,CAAE,EACxC,OAAOS,GACLG,EAAQ,OAASF,EAAQ,OAASH,CAEtC,CAAC,EACDO,EAAqB,CACvB,CACJ,CCjFA,IAAMC,GAA4C,CAChD,OAAQC,EAAW,yBAAyB,EAC5C,OAAQA,EAAW,yBAAyB,CAC9C,EAaO,SAASC,GAAUC,EAAuB,CAC/C,OAAOH,GAAQG,CAAI,EAAE,OACvB,CAaO,SAASC,GAAUD,EAAcE,EAAsB,CACxDL,GAAQG,CAAI,EAAE,UAAYE,GAC5BL,GAAQG,CAAI,EAAE,MAAM,CACxB,CAWO,SAASG,GAAYH,EAAmC,CAC7D,IAAMI,EAAKP,GAAQG,CAAI,EACvB,OAAOK,EAAUD,EAAI,QAAQ,EAC1B,KACCE,EAAI,IAAMF,EAAG,OAAO,EACpBG,EAAUH,EAAG,OAAO,CACtB,CACJ,CC9BA,SAASI,GACPC,EAAiBC,EACR,CACT,OAAQD,EAAG,YAAa,CAGtB,KAAK,iBAEH,OAAIA,EAAG,OAAS,QACP,SAAS,KAAKC,CAAI,EAElB,GAGX,KAAK,kBACL,KAAK,oBACH,MAAO,GAGT,QACE,OAAOD,EAAG,iBACd,CACF,CAWO,SAASE,IAAwC,CACtD,OAAOC,EACLC,EAAU,OAAQ,kBAAkB,EAAE,KAAKC,EAAI,IAAM,EAAI,CAAC,EAC1DD,EAAU,OAAQ,gBAAgB,EAAE,KAAKC,EAAI,IAAM,EAAK,CAAC,CAC3D,EACG,KACCC,EAAU,EAAK,CACjB,CACJ,CAOO,SAASC,IAAsC,CACpD,IAAMC,EAAYJ,EAAyB,OAAQ,SAAS,EACzD,KACCK,EAAOC,GAAM,EAAEA,EAAG,SAAWA,EAAG,QAAQ,EACxCL,EAAIK,IAAO,CACT,KAAMC,GAAU,QAAQ,EAAI,SAAW,SACvC,KAAMD,EAAG,IACT,OAAQ,CACNA,EAAG,eAAe,EAClBA,EAAG,gBAAgB,CACrB,CACF,EAAc,EACdD,EAAO,CAAC,CAAE,KAAAG,EAAM,KAAAX,CAAK,IAAM,CACzB,GAAIW,IAAS,SAAU,CACrB,IAAMC,EAASC,GAAiB,EAChC,GAAI,OAAOD,GAAW,YACpB,MAAO,CAACd,GAAwBc,EAAQZ,CAA
I,CAChD,CACA,MAAO,EACT,CAAC,EACDc,GAAM,CACR,EAGF,OAAOb,GAAiB,EACrB,KACCc,EAAUH,GAAWA,EAAqBI,EAAZT,CAAiB,CACjD,CACJ,CC1GO,SAASU,IAAmB,CACjC,OAAO,IAAI,IAAI,SAAS,IAAI,CAC9B,CAgBO,SAASC,GACdC,EAA4BC,EAAW,GACjC,CACN,GAAIC,EAAQ,oBAAoB,GAAK,CAACD,EAAU,CAC9C,IAAME,EAAKC,EAAE,IAAK,CAAE,KAAMJ,EAAI,IAAK,CAAC,EACpC,SAAS,KAAK,YAAYG,CAAE,EAC5BA,EAAG,MAAM,EACTA,EAAG,OAAO,CAIZ,MACE,SAAS,KAAOH,EAAI,IAExB,CASO,SAASK,IAA8B,CAC5C,OAAO,IAAIC,CACb,CCxCO,SAASC,IAA0B,CACxC,OAAO,SAAS,KAAK,MAAM,CAAC,CAC9B,CAYO,SAASC,GAAgBC,EAAoB,CAClD,IAAMC,EAAKC,EAAE,IAAK,CAAE,KAAMF,CAAK,CAAC,EAChCC,EAAG,iBAAiB,QAASE,GAAMA,EAAG,gBAAgB,CAAC,EACvDF,EAAG,MAAM,CACX,CAWO,SAASG,GACdC,EACoB,CACpB,OAAOC,EACLC,EAA2B,OAAQ,YAAY,EAC/CF,CACF,EACG,KACCG,EAAIV,EAAe,EACnBW,EAAUX,GAAgB,CAAC,EAC3BY,EAAOV,GAAQA,EAAK,OAAS,CAAC,EAC9BW,EAAY,CAAC,CACf,CACJ,CASO,SAASC,GACdP,EACyB,CACzB,OAAOD,GAAkBC,CAAS,EAC/B,KACCG,EAAIK,GAAMC,GAAmB,QAAQD,CAAE,IAAI,CAAE,EAC7CH,EAAOT,GAAM,OAAOA,GAAO,WAAW,CACxC,CACJ,CCtDO,SAASc,GAAWC,EAAoC,CAC7D,IAAMC,EAAQ,WAAWD,CAAK,EAC9B,OAAOE,GAA0BC,GAC/BF,EAAM,YAAY,IAAME,EAAKF,EAAM,OAAO,CAAC,CAC5C,EACE,KACCG,EAAUH,EAAM,OAAO,CACzB,CACJ,CAOO,SAASI,IAAkC,CAChD,IAAMJ,EAAQ,WAAW,OAAO,EAChC,OAAOK,EACLC,EAAU,OAAQ,aAAa,EAAE,KAAKC,EAAI,IAAM,EAAI,CAAC,EACrDD,EAAU,OAAQ,YAAY,EAAE,KAAKC,EAAI,IAAM,EAAK,CAAC,CACvD,EACG,KACCJ,EAAUH,EAAM,OAAO,CACzB,CACJ,CAcO,SAASQ,GACdC,EAA6BC,EACd,CACf,OAAOD,EACJ,KACCE,EAAUC,GAAUA,EAASF,EAAQ,EAAIG,CAAK,CAChD,CACJ,CC/BO,SAASC,GACdC,EAAmBC,EACD,CAClB,OAAO,IAAIC,EAAiBC,GAAY,CACtC,IAAMC,EAAM,IAAI,eAChB,OAAAA,EAAI,KAAK,MAAO,GAAGJ,CAAG,EAAE,EACxBI,EAAI,aAAe,OAGnBA,EAAI,iBAAiB,OAAQ,IAAM,CAC7BA,EAAI,QAAU,KAAOA,EAAI,OAAS,KACpCD,EAAS,KAAKC,EAAI,QAAQ,EAC1BD,EAAS,SAAS,GAIlBA,EAAS,MAAM,IAAI,MAAMC,EAAI,UAAU,CAAC,CAE5C,CAAC,EAGDA,EAAI,iBAAiB,QAAS,IAAM,CAClCD,EAAS,MAAM,IAAI,MAAM,eAAe,CAAC,CAC3C,CAAC,EAGDC,EAAI,iBAAiB,QAAS,IAAM,CAClCD,EAAS,SAAS,CACpB,CAAC,EAGG,OAAOF,GAAA,YAAAA,EAAS,YAAc,cAChCG,EAAI,iBAAiB,WAAYC,GAAS,CA/FhD,IAAAC,EAgGQ,GAAID,EAAM,iBACRJ,EAAQ,UAAW,KAAMI,EAAM,OAASA,EAAM,MAAS,GAAG,MAIrD,CACL,IAAME,G
AASD,EAAAF,EAAI,kBAAkB,gBAAgB,IAAtC,KAAAE,EAA2C,EAC1DL,EAAQ,UAAW,KAAMI,EAAM,OAAS,CAACE,EAAU,GAAG,CACxD,CACF,CAAC,EAGDN,EAAQ,UAAU,KAAK,CAAC,GAI1BG,EAAI,KAAK,EACF,IAAMA,EAAI,MAAM,CACzB,CAAC,CACH,CAcO,SAASI,GACdR,EAAmBC,EACJ,CACf,OAAOF,GAAQC,EAAKC,CAAO,EACxB,KACCQ,EAAUC,GAAOA,EAAI,KAAK,CAAC,EAC3BC,EAAIC,GAAQ,KAAK,MAAMA,CAAI,CAAM,EACjCC,EAAY,CAAC,CACf,CACJ,CAUO,SAASC,GACdd,EAAmBC,EACG,CACtB,IAAMc,EAAM,IAAI,UAChB,OAAOhB,GAAQC,EAAKC,CAAO,EACxB,KACCQ,EAAUC,GAAOA,EAAI,KAAK,CAAC,EAC3BC,EAAID,GAAOK,EAAI,gBAAgBL,EAAK,WAAW,CAAC,EAChDG,EAAY,CAAC,CACf,CACJ,CAUO,SAASG,GACdhB,EAAmBC,EACG,CACtB,IAAMc,EAAM,IAAI,UAChB,OAAOhB,GAAQC,EAAKC,CAAO,EACxB,KACCQ,EAAUC,GAAOA,EAAI,KAAK,CAAC,EAC3BC,EAAID,GAAOK,EAAI,gBAAgBL,EAAK,UAAU,CAAC,EAC/CG,EAAY,CAAC,CACf,CACJ,CC5HO,SAASI,IAAoC,CAClD,MAAO,CACL,EAAG,KAAK,IAAI,EAAG,OAAO,EACtB,EAAG,KAAK,IAAI,EAAG,OAAO,CACxB,CACF,CASO,SAASC,IAAkD,CAChE,OAAOC,EACLC,EAAU,OAAQ,SAAU,CAAE,QAAS,EAAK,CAAC,EAC7CA,EAAU,OAAQ,SAAU,CAAE,QAAS,EAAK,CAAC,CAC/C,EACG,KACCC,EAAIJ,EAAiB,EACrBK,EAAUL,GAAkB,CAAC,CAC/B,CACJ,CC3BO,SAASM,IAAgC,CAC9C,MAAO,CACL,MAAQ,WACR,OAAQ,WACV,CACF,CASO,SAASC,IAA8C,CAC5D,OAAOC,EAAU,OAAQ,SAAU,CAAE,QAAS,EAAK,CAAC,EACjD,KACCC,EAAIH,EAAe,EACnBI,EAAUJ,GAAgB,CAAC,CAC7B,CACJ,CCXO,SAASK,IAAsC,CACpD,OAAOC,EAAc,CACnBC,GAAoB,EACpBC,GAAkB,CACpB,CAAC,EACE,KACCC,EAAI,CAAC,CAACC,EAAQC,CAAI,KAAO,CAAE,OAAAD,EAAQ,KAAAC,CAAK,EAAE,EAC1CC,EAAY,CAAC,CACf,CACJ,CCVO,SAASC,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EAChB,CACtB,IAAMC,EAAQF,EACX,KACCG,EAAwB,MAAM,CAChC,EAGIC,EAAUC,EAAc,CAACH,EAAOD,CAAO,CAAC,EAC3C,KACCK,EAAI,IAAMC,GAAiBR,CAAE,CAAC,CAChC,EAGF,OAAOM,EAAc,CAACJ,EAASD,EAAWI,CAAO,CAAC,EAC/C,KACCE,EAAI,CAAC,CAAC,CAAE,OAAAE,CAAO,EAAG,CAAE,OAAAC,EAAQ,KAAAC,CAAK,EAAG,CAAE,EAAAC,EAAG,EAAAC,CAAE,CAAC,KAAO,CACjD,OAAQ,CACN,EAAGH,EAAO,EAAIE,EACd,EAAGF,EAAO,EAAIG,EAAIJ,CACpB,EACA,KAAAE,CACF,EAAE,CACJ,CACJ,CCzBA,SAASG,GAAQC,EAA+B,CAC9C,OAAOC,EAA8BD,EAAQ,UAAWE,GAAMA,EAAG,IAAI,CACvE,CAWA,SAASC,GAAQH,EAA4B,CAC3C,IAAMI,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAUE,GAAQN,EAAO,YAAYM,CAAI,CAAC,EAGzCF,
CACT,CAgBO,SAASG,GACdC,EAAaR,EAAS,IAAI,OAAOQ,CAAG,EACxB,CACZ,IAAMC,EAAQV,GAAQC,CAAM,EACtBI,EAAQD,GAAQH,CAAM,EAGtBU,EAAU,IAAIL,EACpBK,EAAQ,UAAUN,CAAK,EAGvB,IAAMO,EAAQP,EAAM,KAAKQ,EAAe,EAAGC,GAAQ,EAAI,CAAC,EACxD,OAAOH,EACJ,KACCE,EAAe,EACfE,GAAUL,EAAM,KAAKM,EAAUJ,CAAK,CAAC,CAAC,EACtCK,GAAM,CACR,CACJ,CCJA,IAAMC,GAASC,EAAW,WAAW,EAC/BC,GAAiB,KAAK,MAAMF,GAAO,WAAY,EACrDE,GAAO,KAAO,GAAG,IAAI,IAAIA,GAAO,KAAMC,GAAY,CAAC,CAAC,GAW7C,SAASC,IAAwB,CACtC,OAAOF,EACT,CASO,SAASG,EAAQC,EAAqB,CAC3C,OAAOJ,GAAO,SAAS,SAASI,CAAI,CACtC,CAUO,SAASC,GACdC,EAAkBC,EACV,CACR,OAAO,OAAOA,GAAU,YACpBP,GAAO,aAAaM,CAAG,EAAE,QAAQ,IAAKC,EAAM,SAAS,CAAC,EACtDP,GAAO,aAAaM,CAAG,CAC7B,CChCO,SAASE,GACdC,EAASC,EAAmB,SACP,CACrB,OAAOC,EAAW,sBAAsBF,CAAI,IAAKC,CAAI,CACvD,CAYO,SAASE,GACdH,EAASC,EAAmB,SACL,CACvB,OAAOG,EAAY,sBAAsBJ,CAAI,IAAKC,CAAI,CACxD,CC7EO,SAASI,GACdC,EACsB,CACtB,IAAMC,EAASC,EAAW,6BAA8BF,CAAE,EAC1D,OAAOG,EAAUF,EAAQ,QAAS,CAAE,KAAM,EAAK,CAAC,EAC7C,KACCG,EAAI,IAAMF,EAAW,cAAeF,CAAE,CAAC,EACvCI,EAAIC,IAAY,CAAE,KAAM,UAAUA,EAAQ,SAAS,CAAE,EAAE,CACzD,CACJ,CASO,SAASC,GACdN,EACiC,CACjC,GAAI,CAACO,EAAQ,kBAAkB,GAAK,CAACP,EAAG,kBACtC,OAAOQ,EAGT,GAAI,CAACR,EAAG,OAAQ,CACd,IAAMK,EAAUH,EAAW,cAAeF,CAAE,EACxC,UAAUK,EAAQ,SAAS,IAAM,SAAS,YAAY,IACxDL,EAAG,OAAS,GAChB,CAGA,OAAOS,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAAC,CAAE,KAAAE,CAAK,IAAM,CAC5BZ,EAAG,OAAS,GAGZ,SAAiB,aAAcY,CAAI,CACrC,CAAC,EAGMb,GAAcC,CAAE,EACpB,KACCa,EAAIC,GAASJ,EAAM,KAAKI,CAAK,CAAC,EAC9BC,EAAS,IAAML,EAAM,SAAS,CAAC,EAC/BN,EAAIU,GAAUE,EAAA,CAAE,IAAKhB,GAAOc,EAAQ,CACtC,CACJ,CAAC,CACH,CC5BO,SAASG,GACdC,EAAiB,CAAE,QAAAC,CAAQ,EACN,CACrB,OAAOA,EACJ,KACCC,EAAIC,IAAW,CAAE,OAAQA,IAAWH,CAAG,EAAE,CAC3C,CACJ,CAYO,SAASI,GACdJ,EAAiBK,EACe,CAChC,IAAMC,EAAY,IAAIC,EACtB,OAAAD,EAAU,UAAU,CAAC,CAAE,OAAAE,CAAO,IAAM,CAClCR,EAAG,OAASQ,CACd,CAAC,EAGMT,GAAaC,EAAIK,CAAO,EAC5B,KACCI,EAAIC,GAASJ,EAAU,KAAKI,CAAK,CAAC,EAClCC,EAAS,IAAML,EAAU,SAAS,CAAC,EACnCJ,EAAIQ,GAAUE,EAAA,CAAE,IAAKZ,GAAOU,EAAQ,CACtC,CACJ,CCnEO,SAASG,GACdC,EAAaC,EACA,CACb,OAAIA,IAAU,SAEVC,EAAC,OAAI,MAA
M,gCAAgC,GAAIF,EAAI,KAAK,WACtDE,EAAC,OAAI,MAAM,+BAA+B,CAC5C,EAIAA,EAAC,OAAI,MAAM,aAAa,GAAIF,EAAI,KAAK,WACnCE,EAAC,OAAI,MAAM,+BAA+B,CAC5C,CAGN,CAGO,SAASC,MACXC,EACU,CACb,OACEF,EAAC,OAAI,MAAM,cAAc,KAAK,WAC5BA,EAAC,OAAI,MAAM,iCACRE,CACH,CACF,CAEJ,CCvCO,SAASC,GACdC,EAAqBC,EACR,CAIb,GAHAA,EAASA,EAAS,GAAGA,CAAM,eAAeD,CAAE,GAAK,OAG7CC,EAAQ,CACV,IAAMC,EAASD,EAAS,IAAIA,CAAM,GAAK,OACvC,OACEE,EAAC,SAAM,MAAM,gBAAgB,SAAU,GACpCC,GAAcH,CAAM,EACrBE,EAAC,KAAE,KAAMD,EAAQ,MAAM,uBAAuB,SAAU,IACtDC,EAAC,QAAK,wBAAuBH,EAAI,CACnC,CACF,CAEJ,KACE,QACEG,EAAC,SAAM,MAAM,gBAAgB,SAAU,GACpCC,GAAcH,CAAM,EACrBE,EAAC,QAAK,MAAM,uBAAuB,SAAU,IAC3CA,EAAC,QAAK,wBAAuBH,EAAI,CACnC,CACF,CAGN,CC5BO,SAASK,GAAsBC,EAAyB,CAC7D,OACEC,EAAC,UACC,MAAM,uBACN,MAAOC,GAAY,gBAAgB,EACnC,wBAAuB,IAAIF,CAAE,UAC9B,CAEL,CCQA,SAASG,GACPC,EAAsBC,EACT,CACb,IAAMC,EAASD,EAAO,EAChBE,EAASF,EAAO,EAGhBG,EAAU,OAAO,KAAKJ,EAAS,KAAK,EACvC,OAAOK,GAAO,CAACL,EAAS,MAAMK,CAAG,CAAC,EAClC,OAAyB,CAACC,EAAMD,IAAQ,CACvC,GAAGC,EAAMC,EAAC,WAAKF,CAAI,EAAQ,GAC7B,EAAG,CAAC,CAAC,EACJ,MAAM,EAAG,EAAE,EAGRG,EAASC,GAAc,EACvBC,EAAM,IAAI,IAAIV,EAAS,SAAUQ,EAAO,IAAI,EAC9CG,EAAQ,kBAAkB,GAC5BD,EAAI,aAAa,IAAI,IAAK,OAAO,QAAQV,EAAS,KAAK,EACpD,OAAO,CAAC,CAAC,CAAEY,CAAK,IAAMA,CAAK,EAC3B,OAAO,CAACC,EAAW,CAACC,CAAK,IAAM,GAAGD,CAAS,IAAIC,CAAK,GAAG,KAAK,EAAG,EAAE,CACpE,EAGF,GAAM,CAAE,KAAAC,CAAK,EAAIN,GAAc,EAC/B,OACEF,EAAC,KAAE,KAAM,GAAGG,CAAG,GAAI,MAAM,yBAAyB,SAAU,IAC1DH,EAAC,WACC,MAAM,uCACN,gBAAeP,EAAS,MAAM,QAAQ,CAAC,GAEtCE,EAAS,GAAKK,EAAC,OAAI,MAAM,iCAAiC,EAC1DL,EAAS,GAAKK,EAAC,UAAIP,EAAS,KAAM,EAClCE,GAAU,GAAKK,EAAC,UAAIP,EAAS,KAAM,EACnCG,EAAS,GAAKH,EAAS,KAAK,OAAS,GACpCA,EAAS,KAEVA,EAAS,MAAQA,EAAS,KAAK,IAAIgB,GAAO,CACzC,IAAMC,EAAOF,EACTC,KAAOD,EACL,uBAAuBA,EAAKC,CAAG,CAAC,GAChC,cACF,GACJ,OACET,EAAC,QAAK,MAAO,UAAUU,CAAI,IAAKD,CAAI,CAExC,CAAC,EACAb,EAAS,GAAKC,EAAQ,OAAS,GAC9BG,EAAC,KAAE,MAAM,2BACNW,GAAY,4BAA4B,EAAE,KAAG,GAAGd,CACnD,CAEJ,CACF,CAEJ,CAaO,SAASe,GACdC,EACa,CACb,IAAMC,EAAYD,EAAO,CAAC,EAAE,MACtBE,EAAO,CAAC,GAAGF,CAAM,EAEjBZ,EAASC,GAAc,EAGvBP,EAASoB,EAAK,UAAUC,GAErB,C
ADG,GAAG,IAAI,IAAIA,EAAI,SAAUf,EAAO,IAAI,CAAC,GACrC,SAAS,GAAG,CACvB,EACK,CAACgB,CAAO,EAAIF,EAAK,OAAOpB,EAAQ,CAAC,EAGnCuB,EAAQH,EAAK,UAAUC,GAAOA,EAAI,MAAQF,CAAS,EACnDI,IAAU,KACZA,EAAQH,EAAK,QAGf,IAAMI,EAAOJ,EAAK,MAAM,EAAGG,CAAK,EAC1BE,EAAOL,EAAK,MAAMG,CAAK,EAGvBG,EAAW,CACf7B,GAAqByB,EAAS,EAAc,EAAE,CAACtB,GAAUuB,IAAU,EAAE,EACrE,GAAGC,EAAK,IAAIG,GAAW9B,GAAqB8B,EAAS,CAAW,CAAC,EACjE,GAAGF,EAAK,OAAS,CACfpB,EAAC,WAAQ,MAAM,0BACbA,EAAC,WAAQ,SAAU,IACjBA,EAAC,WACEoB,EAAK,OAAS,GAAKA,EAAK,SAAW,EAChCT,GAAY,wBAAwB,EACpCA,GAAY,2BAA4BS,EAAK,MAAM,CAEzD,CACF,EACC,GAAGA,EAAK,IAAIE,GAAW9B,GAAqB8B,EAAS,CAAW,CAAC,CACpE,CACF,EAAI,CAAC,CACP,EAGA,OACEtB,EAAC,MAAG,MAAM,0BACPqB,CACH,CAEJ,CCrIO,SAASE,GAAkBC,EAAiC,CACjE,OACEC,EAAC,MAAG,MAAM,oBACP,OAAO,QAAQD,CAAK,EAAE,IAAI,CAAC,CAACE,EAAKC,CAAK,IACrCF,EAAC,MAAG,MAAO,oCAAoCC,CAAG,IAC/C,OAAOC,GAAU,SAAWC,GAAMD,CAAK,EAAIA,CAC9C,CACD,CACH,CAEJ,CCAO,SAASE,GACdC,EACa,CACb,IAAMC,EAAU,kCAAkCD,CAAI,GACtD,OACEE,EAAC,OAAI,MAAOD,EAAS,OAAM,IACzBC,EAAC,UAAO,MAAM,gBAAgB,SAAU,GAAI,cAAY,OAAO,CACjE,CAEJ,CCpBO,SAASC,GAAYC,EAAiC,CAC3D,OACEC,EAAC,OAAI,MAAM,0BACTA,EAAC,OAAI,MAAM,qBACRD,CACH,CACF,CAEJ,CCcA,SAASE,GAAcC,EAA+B,CAzDtD,IAAAC,EA0DE,IAAMC,EAASC,GAAc,EAGvBC,EAAM,IAAI,IAAI,MAAMJ,EAAQ,OAAO,IAAKE,EAAO,IAAI,EACzD,OACEG,EAAC,MAAG,MAAM,oBACRA,EAAC,KAAE,KAAM,GAAGD,CAAG,GAAI,MAAM,oBACtBJ,EAAQ,QACRC,EAAAC,EAAO,UAAP,YAAAD,EAAgB,QAASD,EAAQ,QAAQ,OAAS,GACjDK,EAAC,QAAK,MAAM,qBACTL,EAAQ,QAAQ,CAAC,CACpB,CAEJ,CACF,CAEJ,CAcO,SAASM,GACdC,EAAqBC,EACR,CA1Ff,IAAAP,EA2FE,IAAMC,EAASC,GAAc,EAC7B,OAAAI,EAAWA,EAAS,OAAOP,GAAQ,CA5FrC,IAAAC,EA4FwC,SAACA,EAAAD,EAAQ,aAAR,MAAAC,EAAoB,QAAM,EAE/DI,EAAC,OAAI,MAAM,cACTA,EAAC,UACC,MAAM,sBACN,aAAYI,GAAY,gBAAgB,GAEvCD,EAAO,QACPP,EAAAC,EAAO,UAAP,YAAAD,EAAgB,QAASO,EAAO,QAAQ,OAAS,GAChDH,EAAC,QAAK,MAAM,qBACTG,EAAO,QAAQ,CAAC,CACnB,CAEJ,EACAH,EAAC,MAAG,MAAM,oBACPE,EAAS,IAAIR,EAAa,CAC7B,CACF,CAEJ,CCfA,IAAIW,GAAW,EAkBR,SAASC,GACdC,EACqB,CAMrB,IAAMC,EACJC,EAAc,CACZC,GAAkBH,CAAE,EACpBI,GAAkBJ,CAAE,CACtB,CAAC,EACE,KACCK,EAAI,CAAC,CAACC,EAAOC,CAAK,IAAMD,GAASC,CA
AK,EACtCC,EAAqB,CACvB,EAMEC,EACJC,EAAM,IAAMC,GAAqBX,CAAE,CAAC,EAAE,KACpCY,GAASC,EAAyB,EAClCC,GAAa,CAAC,EACdT,EAAI,IAAMU,GAAyBf,CAAE,CAAC,CACxC,EAMF,OAAOC,EAAQ,KACbe,GAAMC,GAAUA,CAAM,EACtBC,EAAU,IAAMhB,EAAc,CAACD,EAASQ,CAAO,CAAC,CAAC,EACjDJ,EAAI,CAAC,CAACY,EAAQE,CAAM,KAAO,CAAE,OAAAF,EAAQ,OAAAE,CAAO,EAAE,EAC9CC,GAAM,CACR,CACF,CAoBO,SAASC,GACdrB,EAAiBsB,EACe,CAChC,GAAM,CAAE,SAAAC,EAAU,UAAAC,CAAU,EAAIF,EAI1BG,EAAK,cAAc3B,IAAU,GAGnC,OAAOY,EAAM,IAAM,CACjB,IAAMgB,EAAQ,IAAIC,EAMZC,EAAQ,IAAIC,GAAgB,EAAK,EACvCH,EAAM,KAAKI,EAAe,EAAGC,GAAQ,EAAK,CAAC,EACxC,UAAUH,CAAK,EAUlB,IAAMI,EAAQJ,EAAM,KAClBK,GAAShB,GAAUiB,GAAM,CAAC,CAACjB,EAAS,IAAKkB,EAAc,CAAC,EACxD3B,EAAqB,EACrBU,EAAUD,GAAUA,EAASM,EAAWa,CAAK,EAC7CC,EAAIC,GAAQA,EAAK,GAAKb,CAAE,EACxBL,GAAM,CACR,EAIAlB,EAAc,CACZwB,EAAM,KAAKrB,EAAI,CAAC,CAAE,OAAAY,CAAO,IAAMA,CAAM,CAAC,EACtCe,EAAM,KACJd,EAAUoB,GAAQlC,GAAkBkC,EAAM,GAAG,CAAC,EAC9CC,EAAU,EAAK,CACjB,CACF,CAAC,EACE,KAAKlC,EAAImC,GAAUA,EAAO,KAAKvB,GAAUA,CAAM,CAAC,CAAC,EACjD,UAAUW,CAAK,EAMlB,IAAMa,EAAUb,EAAM,KACpBc,EAAOzB,GAAUA,CAAM,EACvB0B,GAAeX,EAAOR,CAAS,EAC/BnB,EAAI,CAAC,CAACuC,EAAGN,EAAM,CAAE,KAAAO,CAAK,CAAC,IAAM,CAC3B,IAAMC,EAAO9C,EAAG,sBAAsB,EAChC+C,EAAID,EAAK,MAAQ,EAIvB,GAAIR,EAAK,OAAS,UAChB,MAAO,CAAE,EAAAS,EAAG,EAAG,EAAID,EAAK,MAAO,EAI1B,GAAIA,EAAK,GAAKD,EAAK,OAAS,EAAG,CACpC,GAAM,CAAE,OAAAG,CAAO,EAAIC,GAAeX,CAAI,EACtC,MAAO,CAAE,EAAAS,EAAG,EAAG,IAAMC,CAAO,CAC9B,KACE,OAAO,CAAE,EAAAD,EAAG,EAAG,GAAMD,EAAK,MAAO,CAErC,CAAC,CACH,EAIA,OAAA5C,EAAc,CAAC8B,EAAON,EAAOe,CAAO,CAAC,EAClC,UAAU,CAAC,CAACH,EAAM,CAAE,OAAAnB,CAAO,EAAG+B,CAAM,IAAM,CACzCZ,EAAK,MAAM,YAAY,sBAAuB,GAAGnB,EAAO,CAAC,IAAI,EAC7DmB,EAAK,MAAM,YAAY,sBAAuB,GAAGnB,EAAO,CAAC,IAAI,EAI7DmB,EAAK,MAAM,YAAY,iBAAkB,GAAGY,EAAO,CAAC,IAAI,EACxDZ,EAAK,MAAM,YAAY,iBAAkB,GAAGY,EAAO,CAAC,IAAI,EAIxDZ,EAAK,UAAU,OAAO,mBAAuBY,EAAO,EAAK,CAAC,EAC1DZ,EAAK,UAAU,OAAO,sBAAuBY,EAAO,GAAK,CAAC,CAC5D,CAAC,EAIHtB,EAAM,KACJc,EAAOzB,GAAUA,CAAM,EACvB0B,GAAeX,EAAO,CAACY,EAAGN,IAASA,CAAI,EACvCI,EAAOJ,GAAQA,EAAK,OAAS,SAAS,CACxC,EACG,UAAUA,GAAQ,CACjB,IAAMO,EAAOI,
GAAeE,EAAW,aAAcb,CAAI,CAAC,EAI1DA,EAAK,MAAM,YAAY,qBAAsB,GAAGO,EAAK,KAAK,IAAI,EAC9DP,EAAK,MAAM,YAAY,oBAAsB,KAAQ,CACvD,CAAC,EAMHV,EAAM,KACJpB,EAAqB,EACrB4C,GAAUC,EAAuB,EACjCV,GAAeX,CAAK,CACtB,EACG,UAAU,CAAC,CAACf,EAAQqB,CAAI,IAAM,CAC7BA,EAAK,UAAU,OAAO,sBAAuBrB,CAAM,CACrD,CAAC,EAGHf,EAAc,CACZ0B,EAAM,KAAKc,EAAOzB,GAAUA,CAAM,CAAC,EACnCe,CACF,CAAC,EACE,UAAU,CAAC,CAACY,EAAGN,CAAI,IAAM,CACpBA,EAAK,OAAS,UAChBtC,EAAG,aAAa,gBAAiByB,CAAE,EACnCzB,EAAG,aAAa,gBAAiB,QAAQ,GAEzCA,EAAG,aAAa,mBAAoByB,CAAE,CAE1C,CAAC,EAGHG,EAAM,KAAKc,EAAOzB,GAAU,CAACA,CAAM,CAAC,EACjC,UAAU,IAAM,CACfjB,EAAG,gBAAgB,eAAe,EAClCA,EAAG,gBAAgB,kBAAkB,EACrCA,EAAG,gBAAgB,eAAe,CACpC,CAAC,EAGID,GAAcC,CAAE,EACpB,KACCqC,EAAIiB,GAAS5B,EAAM,KAAK4B,CAAK,CAAC,EAC9BC,EAAS,IAAM7B,EAAM,SAAS,CAAC,EAC/BrB,EAAIiD,GAAUE,EAAA,CAAE,IAAKxD,GAAOsD,EAAQ,CACtC,CACJ,CAAC,CACH,CAeO,SAASG,GACdzD,EAAiB,CAAE,UAAAwB,CAAU,EAC7BkC,EAAY,SAAS,KACW,CAChC,OAAOrC,GAAcrB,EAAI,CACvB,SAAU,IAAI2D,EAAwBC,GAAY,CAChD,IAAMC,EAAQ7D,EAAG,MACXsC,EAAOwB,GAAqBD,CAAK,EACvC,OAAAD,EAAS,KAAKtB,CAAI,EAClBtC,EAAG,gBAAgB,OAAO,EAE1B0D,EAAU,OAAOpB,CAAI,EACd,IAAM,CACXA,EAAK,OAAO,EACZtC,EAAG,aAAa,QAAS6D,CAAK,CAChC,CACF,CAAC,EACD,UAAArC,CACF,CAAC,CACH,CC3QO,SAASuC,GACdC,EAAiBC,EACO,CACxB,IAAMC,EAAUC,EAAM,IAAMC,EAAc,CACxCC,GAAmBL,CAAE,EACrBM,GAA0BL,CAAS,CACrC,CAAC,CAAC,EACC,KACCM,EAAI,CAAC,CAAC,CAAE,EAAAC,EAAG,EAAAC,CAAE,EAAGC,CAAM,IAAqB,CACzC,GAAM,CAAE,MAAAC,EAAO,OAAAC,CAAO,EAAIC,GAAeb,CAAE,EAC3C,MAAQ,CACN,EAAGQ,EAAIE,EAAO,EAAIC,EAAS,EAC3B,EAAGF,EAAIC,EAAO,EAAIE,EAAS,CAC7B,CACF,CAAC,CACH,EAGF,OAAOE,GAAkBd,CAAE,EACxB,KACCe,EAAUC,GAAUd,EACjB,KACCK,EAAIU,IAAW,CAAE,OAAAD,EAAQ,OAAAC,CAAO,EAAE,EAClCC,GAAK,CAAC,CAACF,GAAU,GAAQ,CAC3B,CACF,CACF,CACJ,CAWO,SAASG,GACdnB,EAAiBC,EAAwB,CAAE,QAAAmB,CAAQ,EAChB,CACnC,GAAM,CAACC,EAASC,CAAK,EAAI,MAAM,KAAKtB,EAAG,QAAQ,EAG/C,OAAOG,EAAM,IAAM,CACjB,IAAMoB,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EACxD,OAAAJ,EAAM,UAAU,CAGd,KAAK,CAAE,OAAAN,CAAO,EAAG,CACfjB,EAAG,MAAM,YAAY,iBAAkB,GAAGiB,EAAO,CAAC,IAAI,EACtDjB,EAAG,MAAM,YAAY,iBAAk
B,GAAGiB,EAAO,CAAC,IAAI,CACxD,EAGA,UAAW,CACTjB,EAAG,MAAM,eAAe,gBAAgB,EACxCA,EAAG,MAAM,eAAe,gBAAgB,CAC1C,CACF,CAAC,EAGD4B,GAAuB5B,CAAE,EACtB,KACC6B,EAAUJ,CAAK,CACjB,EACG,UAAUK,GAAW,CACpB9B,EAAG,gBAAgB,kBAAmB8B,CAAO,CAC/C,CAAC,EAGLC,EACER,EAAM,KAAKS,EAAO,CAAC,CAAE,OAAAhB,CAAO,IAAMA,CAAM,CAAC,EACzCO,EAAM,KAAKU,GAAa,GAAG,EAAGD,EAAO,CAAC,CAAE,OAAAhB,CAAO,IAAM,CAACA,CAAM,CAAC,CAC/D,EACG,UAAU,CAGT,KAAK,CAAE,OAAAA,CAAO,EAAG,CACXA,EACFhB,EAAG,QAAQqB,CAAO,EAElBA,EAAQ,OAAO,CACnB,EAGA,UAAW,CACTrB,EAAG,QAAQqB,CAAO,CACpB,CACF,CAAC,EAGHE,EACG,KACCW,GAAU,GAAIC,EAAuB,CACvC,EACG,UAAU,CAAC,CAAE,OAAAnB,CAAO,IAAM,CACzBK,EAAQ,UAAU,OAAO,qBAAsBL,CAAM,CACvD,CAAC,EAGLO,EACG,KACCa,GAAa,IAAKD,EAAuB,EACzCH,EAAO,IAAM,CAAC,CAAChC,EAAG,YAAY,EAC9BO,EAAI,IAAMP,EAAG,aAAc,sBAAsB,CAAC,EAClDO,EAAI,CAAC,CAAE,EAAAC,CAAE,IAAMA,CAAC,CAClB,EACG,UAAU,CAGT,KAAK6B,EAAQ,CACPA,EACFrC,EAAG,MAAM,YAAY,iBAAkB,GAAG,CAACqC,CAAM,IAAI,EAErDrC,EAAG,MAAM,eAAe,gBAAgB,CAC5C,EAGA,UAAW,CACTA,EAAG,MAAM,eAAe,gBAAgB,CAC1C,CACF,CAAC,EAGLsC,EAAsBhB,EAAO,OAAO,EACjC,KACCO,EAAUJ,CAAK,EACfO,EAAOO,GAAM,EAAEA,EAAG,SAAWA,EAAG,QAAQ,CAC1C,EACG,UAAUA,GAAM,CACfA,EAAG,gBAAgB,EACnBA,EAAG,eAAe,CACpB,CAAC,EAGLD,EAAsBhB,EAAO,WAAW,EACrC,KACCO,EAAUJ,CAAK,EACfe,GAAejB,CAAK,CACtB,EACG,UAAU,CAAC,CAACgB,EAAI,CAAE,OAAAvB,CAAO,CAAC,IAAM,CA3OzC,IAAAyB,EA8OU,GAAIF,EAAG,SAAW,GAAKA,EAAG,SAAWA,EAAG,QACtCA,EAAG,eAAe,UAGTvB,EAAQ,CACjBuB,EAAG,eAAe,EAGlB,IAAMG,EAAS1C,EAAG,cAAe,QAAQ,gBAAgB,EACrD0C,aAAkB,YACpBA,EAAO,MAAM,GAEbD,EAAAE,GAAiB,IAAjB,MAAAF,EAAoB,MACxB,CACF,CAAC,EAGLrB,EACG,KACCS,EAAUJ,CAAK,EACfO,EAAOY,GAAUA,IAAWvB,CAAO,EACnCwB,GAAM,GAAG,CACX,EACG,UAAU,IAAM7C,EAAG,MAAM,CAAC,EAGxBD,GAAgBC,EAAIC,CAAS,EACjC,KACC6C,EAAIC,GAASxB,EAAM,KAAKwB,CAAK,CAAC,EAC9BC,EAAS,IAAMzB,EAAM,SAAS,CAAC,EAC/BhB,EAAIwC,GAAUE,EAAA,CAAE,IAAKjD,GAAO+C,EAAQ,CACtC,CACJ,CAAC,CACH,CCxMA,SAASG,GAAUC,EAAuC,CACxD,OAAOA,EAAU,UAAY,OACzBC,EAAY,eAAgBD,CAAS,EACrC,CAACA,CAAS,CAChB,CASA,SAASE,GAAYF,EAAgC,CACnD,IAAMG,EAAkB,CAAC,EACzB,QAAWC,KAAML,GAAUC,CAAS,EAAG,CACrC,IAAMK,EAAgB,CAAC,EAGjBC,EAAK,SAA
S,mBAAmBF,EAAI,WAAW,SAAS,EAC/D,QAASG,EAAOD,EAAG,SAAS,EAAGC,EAAMA,EAAOD,EAAG,SAAS,EACtDD,EAAM,KAAKE,CAAY,EAGzB,QAASC,KAAQH,EAAO,CACtB,IAAII,EAGJ,KAAQA,EAAQ,gBAAgB,KAAKD,EAAK,WAAY,GAAI,CACxD,GAAM,CAAC,CAAEE,EAAIC,CAAK,EAAIF,EACtB,GAAI,OAAOE,GAAU,YAAa,CAChC,IAAMC,EAASJ,EAAK,UAAUC,EAAM,KAAK,EACzCD,EAAOI,EAAO,UAAUF,EAAG,MAAM,EACjCP,EAAQ,KAAKS,CAAM,CAGrB,KAAO,CACLJ,EAAK,YAAcE,EACnBP,EAAQ,KAAKK,CAAI,EACjB,KACF,CACF,CACF,CACF,CACA,OAAOL,CACT,CAQA,SAASU,GAAKC,EAAqBC,EAA2B,CAC5DA,EAAO,OAAO,GAAG,MAAM,KAAKD,EAAO,UAAU,CAAC,CAChD,CAoBO,SAASE,GACdZ,EAAiBJ,EAAwB,CAAE,QAAAiB,EAAS,OAAAC,CAAO,EACxB,CAGnC,IAAMC,EAASnB,EAAU,QAAQ,MAAM,EACjCoB,EAASD,GAAA,YAAAA,EAAQ,GAGjBE,EAAc,IAAI,IACxB,QAAWT,KAAUV,GAAYF,CAAS,EAAG,CAC3C,GAAM,CAAC,CAAEU,CAAE,EAAIE,EAAO,YAAa,MAAM,WAAW,EAChDU,GAAmB,yBAAyBZ,CAAE,IAAKN,CAAE,IACvDiB,EAAY,IAAIX,EAAIa,GAAiBb,EAAIU,CAAM,CAAC,EAChDR,EAAO,YAAYS,EAAY,IAAIX,CAAE,CAAE,EAE3C,CAGA,OAAIW,EAAY,OAAS,EAChBG,EAGFC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EAGlDC,EAAsC,CAAC,EAC7C,OAAW,CAACrB,EAAIsB,CAAU,IAAKX,EAC7BU,EAAM,KAAK,CACTE,EAAW,cAAeD,CAAU,EACpCC,EAAW,yBAAyBvB,CAAE,IAAKN,CAAE,CAC/C,CAAC,EAGH,OAAAc,EAAO,KAAKgB,EAAUN,CAAK,CAAC,EACzB,UAAUO,GAAU,CACnB/B,EAAG,OAAS,CAAC+B,EAGb/B,EAAG,UAAU,OAAO,qBAAsB+B,CAAM,EAGhD,OAAW,CAACC,EAAOC,CAAK,IAAKN,EACtBI,EAGHtB,GAAKuB,EAAOC,CAAK,EAFjBxB,GAAKwB,EAAOD,CAAK,CAGvB,CAAC,EAGIE,EAAM,GAAG,CAAC,GAAGjB,CAAW,EAC5B,IAAI,CAAC,CAAC,CAAEW,CAAU,IACjBO,GAAgBP,EAAYhC,EAAW,CAAE,QAAAiB,CAAQ,CAAC,CACnD,CACH,EACG,KACCuB,EAAS,IAAMd,EAAM,SAAS,CAAC,EAC/Be,GAAM,CACR,CACJ,CAAC,CACH,CC7JA,SAASC,GAASC,EAA0C,CAC1D,GAAIA,EAAG,mBAAoB,CACzB,IAAMC,EAAUD,EAAG,mBACnB,GAAIC,EAAQ,UAAY,KACtB,OAAOA,EAGJ,GAAIA,EAAQ,UAAY,KAAO,CAACA,EAAQ,SAAS,OACpD,OAAOF,GAASE,CAAO,CAC3B,CAIF,CAcO,SAASC,GACdF,EAAiBG,EACkB,CACnC,OAAOC,EAAM,IAAM,CACjB,IAAMC,EAAON,GAASC,CAAE,EACxB,OAAO,OAAOK,GAAS,YACnBC,GAAoBD,EAAML,EAAIG,CAAO,EACrCI,CACN,CAAC,CACH,CCjEA,IAAAC,GAAwB,SA4ExB,IAAIC,GAAW,EAaf,SAASC,GAAkBC,EAA0C,CACnE,GAAIA,EAAG,mBAAoB,CACzB,IAAMC,EAAUD,EAAG
,mBACnB,GAAIC,EAAQ,UAAY,KACtB,OAAOA,EAGJ,GAAIA,EAAQ,UAAY,KAAO,CAACA,EAAQ,SAAS,OACpD,OAAOF,GAAkBE,CAAO,CACpC,CAIF,CAgBO,SAASC,GACdF,EACsB,CACtB,OAAOG,GAAiBH,CAAE,EACvB,KACCI,EAAI,CAAC,CAAE,MAAAC,CAAM,KAEJ,CACL,WAFcC,GAAsBN,CAAE,EAElB,MAAQK,CAC9B,EACD,EACDE,EAAwB,YAAY,CACtC,CACJ,CAoBO,SAASC,GACdR,EAAiBS,EACiB,CAClC,GAAM,CAAE,QAASC,CAAM,EAAI,WAAW,SAAS,EAGzCC,EAAWC,EAAM,IAAM,CAC3B,IAAMC,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,GAAS,CAAC,CAAC,EACpCH,EAAM,UAAU,CAAC,CAAE,WAAAI,CAAW,IAAM,CAC9BA,GAAcP,EAChBV,EAAG,aAAa,WAAY,GAAG,EAE/BA,EAAG,gBAAgB,UAAU,CACjC,CAAC,EAGD,IAAMkB,EAAoD,CAAC,EAC3D,GAAI,GAAAC,QAAY,YAAY,IACtBnB,EAAG,QAAQ,OAAO,GACpBoB,EAAQ,mBAAmB,GAAK,CAACpB,EAAG,QAAQ,UAAU,GACrD,CACD,IAAMqB,EAASrB,EAAG,QAAQ,KAAK,EAC/BqB,EAAO,GAAK,UAAUvB,IAAU,GAGhC,IAAMwB,EAASC,GAAsBF,EAAO,EAAE,EAC9CA,EAAO,aAAaC,EAAQtB,CAAE,EAC1BoB,EAAQ,kBAAkB,GAC5BF,EAAS,KAAKM,GAAoBF,EAAQ,CAAE,SAAU,CAAC,CAAC,CAC5D,CAIF,IAAMG,EAAYzB,EAAG,QAAQ,YAAY,EACzC,GAAIyB,aAAqB,YAAa,CACpC,IAAMC,EAAO3B,GAAkB0B,CAAS,EAGxC,GAAI,OAAOC,GAAS,cAClBD,EAAU,UAAU,SAAS,UAAU,GACvCL,EAAQ,uBAAuB,GAC9B,CACD,IAAMO,EAAeC,GAAoBF,EAAM1B,EAAIS,CAAO,EAC1DS,EAAS,KACPf,GAAiBsB,CAAS,EACvB,KACCI,EAAUd,CAAK,EACfX,EAAI,CAAC,CAAE,MAAAC,EAAO,OAAAyB,CAAO,IAAMzB,GAASyB,CAAM,EAC1CC,EAAqB,EACrBC,EAAUC,GAAUA,EAASN,EAAeO,CAAK,CACnD,CACJ,CACF,CACF,CAOA,OADcC,EAAY,oBAAqBnC,CAAE,EACvC,QACRA,EAAG,UAAU,IAAI,kBAAkB,EAG9BE,GAAeF,CAAE,EACrB,KACCoC,EAAIC,GAASxB,EAAM,KAAKwB,CAAK,CAAC,EAC9BC,EAAS,IAAMzB,EAAM,SAAS,CAAC,EAC/BT,EAAIiC,GAAUE,EAAA,CAAE,IAAKvC,GAAOqC,EAAQ,EACpCG,GAAU,GAAGtB,CAAQ,CACvB,CACJ,CAAC,EAGD,OAAIE,EAAQ,cAAc,EACjBqB,GAAuBzC,CAAE,EAC7B,KACC0C,EAAOC,GAAWA,CAAO,EACzBC,GAAK,CAAC,EACNZ,EAAU,IAAMrB,CAAQ,CAC1B,EAGGA,CACT,CCnLO,SAASkC,GACdC,EAAwB,CAAE,QAAAC,EAAS,OAAAC,CAAO,EACrB,CACrB,IAAIC,EAAO,GACX,OAAOC,EAGLH,EACG,KACCI,EAAIC,GAAUA,EAAO,QAAQ,qBAAqB,CAAE,EACpDC,EAAOC,GAAWR,IAAOQ,CAAO,EAChCH,EAAI,KAAO,CACT,OAAQ,OAAQ,OAAQ,EAC1B,EAAa,CACf,EAGFH,EACG,KACCK,EAAOE,GAAUA,GAAU,CAACN,CAAI,EAChCO,EAAI,IAAMP,EAAOH,EAAG,IAAI,EACxBK,EAAII,IAAW,CACb,OAAQA,EAAS,OAAS,OAC
5B,EAAa,CACf,CACJ,CACF,CAaO,SAASE,GACdX,EAAwBY,EACQ,CAChC,OAAOC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAAC,CAAE,OAAAE,EAAQ,OAAAC,CAAO,IAAM,CACtCjB,EAAG,gBAAgB,OAAQgB,IAAW,MAAM,EACxCC,GACFjB,EAAG,eAAe,CACtB,CAAC,EAGMD,GAAaC,EAAIY,CAAO,EAC5B,KACCF,EAAIQ,GAASJ,EAAM,KAAKI,CAAK,CAAC,EAC9BC,EAAS,IAAML,EAAM,SAAS,CAAC,EAC/BT,EAAIa,GAAUE,EAAA,CAAE,IAAKpB,GAAOkB,EAAQ,CACtC,CACJ,CAAC,CACH,CCzIA,IAAAG,GAAA,yvLCqDA,IAAIC,GAKAC,GAAW,EAWf,SAASC,IAAiC,CACxC,OAAO,OAAO,SAAY,aAAe,mBAAmB,QACxDC,GAAY,kDAAkD,EAC9DC,EAAG,MAAS,CAClB,CAaO,SAASC,GACdC,EACgC,CAChC,OAAAA,EAAG,UAAU,OAAO,SAAS,EAC7BN,QAAaE,GAAa,EACvB,KACCK,EAAI,IAAM,QAAQ,WAAW,CAC3B,YAAa,GACb,SAAAC,GACA,SAAU,CACR,cAAe,OACf,gBAAiB,OACjB,aAAc,MAChB,CACF,CAAC,CAAC,EACFC,EAAI,IAAG,EAAY,EACnBC,EAAY,CAAC,CACf,GAGFV,GAAS,UAAU,IAAYW,GAAA,sBAC7BL,EAAG,UAAU,IAAI,SAAS,EAC1B,IAAMM,EAAK,aAAaX,IAAU,GAG5BY,EAAOC,EAAE,MAAO,CAAE,MAAO,SAAU,CAAC,EACpCC,EAAOT,EAAG,YAGV,CAAE,IAAAU,EAAK,GAAAC,CAAG,EAAI,MAAM,QAAQ,OAAOL,EAAIG,CAAI,EAG3CG,EAASL,EAAK,aAAa,CAAE,KAAM,QAAS,CAAC,EACnDK,EAAO,UAAYF,EAGnBV,EAAG,YAAYO,CAAI,EACnBI,GAAA,MAAAA,EAAKC,EACP,EAAC,EAGMlB,GACJ,KACCS,EAAI,KAAO,CAAE,IAAKH,CAAG,EAAE,CACzB,CACJ,CCtFA,IAAMa,GAAWC,EAAE,OAAO,EAgBnB,SAASC,GACdC,EACkC,CAClC,OAAAA,EAAG,YAAYH,EAAQ,EACvBA,GAAS,YAAYI,GAAYD,CAAE,CAAC,EAG7BE,EAAG,CAAE,IAAKF,CAAG,CAAC,CACvB,CC4BO,SAASG,GACdC,EACyB,CACzB,IAAMC,EAAUD,EAAO,KAAKE,GAASA,EAAM,OAAO,GAAKF,EAAO,CAAC,EAC/D,OAAOG,EAAM,GAAGH,EAAO,IAAIE,GAASE,EAAUF,EAAO,QAAQ,EAC1D,KACCG,EAAI,IAAMC,EAA6B,cAAcJ,EAAM,EAAE,IAAI,CAAC,CACpE,CACF,CAAC,EACE,KACCK,EAAUD,EAA6B,cAAcL,EAAQ,EAAE,IAAI,CAAC,EACpEI,EAAIG,IAAW,CAAE,OAAAA,CAAO,EAAE,CAC5B,CACJ,CAUO,SAASC,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACF,CACpC,IAAMC,EAAYP,EAAW,iBAAkBI,CAAE,EAC3CV,EAASc,EAA8B,iBAAkBJ,CAAE,EAG3DK,EAAOC,GAAoB,MAAM,EACvCN,EAAG,OAAOK,CAAI,EAGd,IAAME,EAAOD,GAAoB,MAAM,EACvC,OAAAN,EAAG,OAAOO,CAAI,EAGPC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EACxDC,EAAc,CAACL,EAAOM,GAAiBf,CAAE,EAAGgB,GAAuBhB,CAAE,CAAC,CAAC,EAC
pE,KACCiB,EAAUN,CAAK,EACfO,GAAU,EAAGC,EAAuB,CACtC,EACG,UAAU,CAGT,KAAK,CAAC,CAAE,OAAArB,CAAO,EAAGsB,CAAI,EAAG,CACvB,IAAMC,EAASC,GAAiBxB,CAAM,EAChC,CAAE,MAAAyB,CAAM,EAAIC,GAAe1B,CAAM,EAGvCE,EAAG,MAAM,YAAY,mBAAoB,GAAGqB,EAAO,CAAC,IAAI,EACxDrB,EAAG,MAAM,YAAY,uBAAwB,GAAGuB,CAAK,IAAI,EAGzD,IAAME,EAAUC,GAAwBvB,CAAS,GAE/CkB,EAAO,EAAYI,EAAQ,GAC3BJ,EAAO,EAAIE,EAAQE,EAAQ,EAAIL,EAAK,QAEpCjB,EAAU,SAAS,CACjB,KAAM,KAAK,IAAI,EAAGkB,EAAO,EAAI,EAAE,EAC/B,SAAU,QACZ,CAAC,CACL,EAGA,UAAW,CACTrB,EAAG,MAAM,eAAe,kBAAkB,EAC1CA,EAAG,MAAM,eAAe,sBAAsB,CAChD,CACF,CAAC,EAGLc,EAAc,CACZa,GAA0BxB,CAAS,EACnCY,GAAiBZ,CAAS,CAC5B,CAAC,EACE,KACCc,EAAUN,CAAK,CACjB,EACG,UAAU,CAAC,CAACU,EAAQD,CAAI,IAAM,CAC7B,IAAMK,EAAUG,GAAsBzB,CAAS,EAC/CE,EAAK,OAASgB,EAAO,EAAI,GACzBd,EAAK,OAASc,EAAO,EAAII,EAAQ,MAAQL,EAAK,MAAQ,EACxD,CAAC,EAGL3B,EACEC,EAAUW,EAAM,OAAO,EAAE,KAAKV,EAAI,IAAM,EAAE,CAAC,EAC3CD,EAAUa,EAAM,OAAO,EAAE,KAAKZ,EAAI,IAAM,CAAE,CAAC,CAC7C,EACG,KACCsB,EAAUN,CAAK,CACjB,EACG,UAAUkB,GAAa,CACtB,GAAM,CAAE,MAAAN,CAAM,EAAIC,GAAerB,CAAS,EAC1CA,EAAU,SAAS,CACjB,KAAMoB,EAAQM,EACd,SAAU,QACZ,CAAC,CACH,CAAC,EAGL3B,EACG,KACCe,EAAUN,CAAK,EACfmB,EAAOtC,GAASF,EAAO,SAASE,CAAyB,CAAC,CAC5D,EACG,UAAUA,GAASA,EAAM,MAAM,CAAC,EAGrCW,EAAU,UAAU,IAAI,uBAAuB,EAC/C,QAAWX,KAASF,EAAQ,CAC1B,IAAMyC,EAAQnC,EAA6B,cAAcJ,EAAM,EAAE,IAAI,EACrEuC,EAAM,gBAAgBC,EAAE,IAAK,CAC3B,KAAM,IAAID,EAAM,OAAO,GACvB,SAAU,EACZ,EAAG,GAAG,MAAM,KAAKA,EAAM,UAAU,CAAC,CAAC,EAGnCrC,EAAsBqC,EAAM,kBAAoB,OAAO,EACpD,KACCd,EAAUN,CAAK,EACfmB,EAAOG,GAAM,EAAEA,EAAG,SAAWA,EAAG,QAAQ,EACxCC,EAAID,GAAM,CACRA,EAAG,eAAe,EAClBA,EAAG,gBAAgB,CACrB,CAAC,CACH,EAEG,UAAU,IAAM,CACf,QAAQ,aAAa,CAAC,EAAG,GAAI,IAAIF,EAAM,OAAO,EAAE,EAChDA,EAAM,MAAM,CACd,CAAC,CACP,CAGA,OAAII,EAAQ,mBAAmB,GAC7B1B,EAAM,KACJ2B,GAAK,CAAC,EACNC,GAAepC,CAAS,CAC1B,EACG,UAAU,CAAC,CAAC,CAAE,OAAAH,CAAO,EAAG,CAAE,OAAAuB,CAAO,CAAC,IAAM,CACvC,IAAMiB,EAAMxC,EAAO,UAAU,KAAK,EAClC,GAAIA,EAAO,aAAa,mBAAmB,EACzCA,EAAO,gBAAgB,mBAAmB,MAGrC,CACL,IAAMyC,EAAIvC,EAAG,UAAYqB,EAAO,EAGhC,QAAWmB,KAAOpC,EAAY,aAAa,EACzC,QAAWZ,KAASY,EAClB,iBAAkBoC,C
ACpB,EAAG,CACD,IAAMT,GAAQnC,EAAW,cAAcJ,EAAM,EAAE,IAAI,EACnD,GACEuC,KAAUjC,GACViC,GAAM,UAAU,KAAK,IAAMO,EAC3B,CACAP,GAAM,aAAa,oBAAqB,EAAE,EAC1CvC,EAAM,MAAM,EACZ,KACF,CACF,CAGF,OAAO,SAAS,CACd,IAAKQ,EAAG,UAAYuC,CACtB,CAAC,EAGD,IAAME,EAAO,SAAmB,QAAQ,GAAK,CAAC,EAC9C,SAAS,SAAU,CAAC,GAAG,IAAI,IAAI,CAACH,EAAK,GAAGG,CAAI,CAAC,CAAC,CAAC,CACjD,CACF,CAAC,EAGLhC,EAAM,KAAKQ,EAAUN,CAAK,CAAC,EACxB,UAAU,IAAM,CACf,QAAW+B,KAAStC,EAA8B,eAAgBJ,CAAE,EAClE0C,EAAM,MAAM,CAChB,CAAC,EAGIrD,GAAiBC,CAAM,EAC3B,KACC4C,EAAIS,GAASlC,EAAM,KAAKkC,CAAK,CAAC,EAC9BC,EAAS,IAAMnC,EAAM,SAAS,CAAC,EAC/Bd,EAAIgD,GAAUE,EAAA,CAAE,IAAK7C,GAAO2C,EAAQ,CACtC,CACJ,CAAC,EACE,KACCG,GAAYC,EAAc,CAC5B,CACJ,CCpMO,SAASC,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,EAAS,OAAAC,CAAO,EACd,CAChC,OAAOC,EAGL,GAAGC,EAAY,4BAA6BL,CAAE,EAC3C,IAAIM,GAASC,GAAqBD,EAAO,CAAE,QAAAJ,EAAS,OAAAC,CAAO,CAAC,CAAC,EAGhE,GAAGE,EAAY,2BAA4BL,CAAE,EAC1C,IAAIM,GAASE,GAAeF,EAAO,CAAE,QAAAJ,EAAS,OAAAC,CAAO,CAAC,CAAC,EAG1D,GAAGE,EAAY,cAAeL,CAAE,EAC7B,IAAIM,GAASG,GAAaH,CAAK,CAAC,EAGnC,GAAGD,EAAY,qBAAsBL,CAAE,EACpC,IAAIM,GAASI,GAAeJ,CAAK,CAAC,EAGrC,GAAGD,EAAY,UAAWL,CAAE,EACzB,IAAIM,GAASK,GAAaL,EAAO,CAAE,QAAAJ,EAAS,OAAAC,CAAO,CAAC,CAAC,EAGxD,GAAGE,EAAY,cAAeL,CAAE,EAC7B,IAAIM,GAASM,GAAiBN,EAAO,CAAE,UAAAL,EAAW,QAAAC,CAAQ,CAAC,CAAC,EAG/D,GAAGG,EAAY,UAAWL,CAAE,EACzB,OAAO,IAAMa,EAAQ,kBAAkB,CAAC,EACxC,IAAIP,GAASQ,GAAoBR,EAAO,CAAE,UAAAL,CAAU,CAAC,CAAC,CAC3D,CACF,CCtDO,SAASc,GACdC,EAAkB,CAAE,OAAAC,CAAO,EACP,CACpB,OAAOA,EACJ,KACCC,EAAUC,GAAWC,EACnBC,EAAG,EAAI,EACPA,EAAG,EAAK,EAAE,KAAKC,GAAM,GAAI,CAAC,CAC5B,EACG,KACCC,EAAIC,IAAW,CAAE,QAAAL,EAAS,OAAAK,CAAO,EAAE,CACrC,CACF,CACF,CACJ,CAaO,SAASC,GACdC,EAAiBC,EACc,CAC/B,IAAMC,EAAQC,EAAW,cAAeH,CAAE,EAC1C,OAAOI,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAAC,CAAE,QAAAZ,EAAS,OAAAK,CAAO,IAAM,CACvCE,EAAG,UAAU,OAAO,oBAAqBF,CAAM,EAC/CI,EAAM,YAAcT,CACtB,CAAC,EAGMJ,GAAYW,EAAIC,CAAO,EAC3B,KACCM,EAAIC,GAASH,EAAM,KAAKG,CAAK,CAAC,EAC9BC,EAAS,IAAMJ,EAAM,SAAS,CAAC,EAC/BR,EAAIW,GAAUE,EAAA,CAAE,IAAKV,GAAOQ,EAAQ,CACtC,CACJ,CAAC,CACH,CCnDA,I
AAIG,GAAW,EAiBR,SAASC,GACdC,EAAiBC,EACI,CACrB,SAAS,KAAK,OAAOD,CAAE,EAGvB,GAAM,CAAE,MAAAE,CAAM,EAAIC,GAAeH,CAAE,EACnCA,EAAG,MAAM,YAAY,qBAAsB,GAAGE,CAAK,IAAI,EACvDF,EAAG,OAAO,EAGV,IAAMI,EAAYC,GAAoBJ,CAAI,EACpCK,EACJ,OAAOF,GAAc,YACjBG,GAA0BH,CAAS,EACnCI,EAAG,CAAE,EAAG,EAAG,EAAG,CAAE,CAAC,EAGjBC,EAAUC,EACdC,GAAkBV,CAAI,EACtBW,GAAkBX,CAAI,CACxB,EACG,KACCY,EAAqB,CACvB,EAGF,OAAOC,EAAc,CAACL,EAASH,CAAO,CAAC,EACpC,KACCS,EAAI,CAAC,CAACC,EAAQC,CAAM,IAAM,CACxB,GAAI,CAAE,EAAAC,EAAG,EAAAC,CAAE,EAAIC,GAAiBnB,CAAI,EAC9BoB,EAAOlB,GAAeF,CAAI,EAU1BqB,EAAQrB,EAAK,QAAQ,OAAO,EAClC,OAAIqB,GAASrB,EAAK,gBAChBiB,GAAKI,EAAM,WAAarB,EAAK,cAAc,WAC3CkB,GAAKG,EAAM,UAAarB,EAAK,cAAc,WAEtC,CACL,OAAAe,EACA,OAAQ,CACN,EAAGE,EAAID,EAAO,EAAII,EAAK,MAAS,EAAInB,EAAQ,EAC5C,EAAGiB,EAAIF,EAAO,EAAII,EAAK,OAAS,CAClC,CACF,CACF,CAAC,CACH,CACJ,CASO,SAASE,GACdvB,EACgC,CAChC,IAAMwB,EAAQxB,EAAG,MACjB,GAAI,CAACwB,EAAM,OACT,OAAOC,EAGT,IAAMC,EAAK,aAAa5B,IAAU,GAC5B6B,EAAUC,GAAcF,EAAI,QAAQ,EACpCG,EAAUC,EAAW,cAAeH,CAAO,EACjD,OAAAE,EAAQ,UAAYL,EAGbO,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAGd,KAAK,CAAE,OAAAE,CAAO,EAAG,CACfP,EAAQ,MAAM,YAAY,iBAAkB,GAAGO,EAAO,CAAC,IAAI,EAC3DP,EAAQ,MAAM,YAAY,iBAAkB,GAAGO,EAAO,CAAC,IAAI,CAC7D,EAGA,UAAW,CACTP,EAAQ,MAAM,eAAe,gBAAgB,EAC7CA,EAAQ,MAAM,eAAe,gBAAgB,CAC/C,CACF,CAAC,EAGDjB,EACEsB,EAAM,KAAKG,EAAO,CAAC,CAAE,OAAAnB,CAAO,IAAMA,CAAM,CAAC,EACzCgB,EAAM,KAAKI,GAAa,GAAG,EAAGD,EAAO,CAAC,CAAE,OAAAnB,CAAO,IAAM,CAACA,CAAM,CAAC,CAC/D,EACG,UAAU,CAGT,KAAK,CAAE,OAAAA,CAAO,EAAG,CACXA,GACFhB,EAAG,sBAAsB,WAAY2B,CAAO,EAC5C3B,EAAG,aAAa,mBAAoB0B,CAAE,EACtC1B,EAAG,gBAAgB,OAAO,IAE1B2B,EAAQ,OAAO,EACf3B,EAAG,gBAAgB,kBAAkB,EACrCA,EAAG,aAAa,QAASwB,CAAK,EAElC,EAGA,UAAW,CACTG,EAAQ,OAAO,EACf3B,EAAG,gBAAgB,kBAAkB,EACrCA,EAAG,aAAa,QAASwB,CAAK,CAChC,CACF,CAAC,EAGHQ,EACG,KACCK,GAAU,GAAIC,EAAuB,CACvC,EACG,UAAU,CAAC,CAAE,OAAAtB,CAAO,IAAM,CACzBW,EAAQ,UAAU,OAAO,qBAAsBX,CAAM,CACvD,CAAC,EAMLgB,EACG,KACCO,GAAa,IAAKD,EAAuB,EACzCH,EAAO,IAAM,CAAC,CAACnC,EAAG,YAAY,EAC9Be,EAAI,IAAMf,EAAG,aAAc,sBAAsB,CAAC,EAClDe,EAAI,CAAC
,CAAE,EAAAG,CAAE,IAAMA,CAAC,CAClB,EACC,UAAU,CAGT,KAAKsB,EAAQ,CACPA,EACFb,EAAQ,MAAM,YAAY,iBAAkB,GAAG,CAACa,CAAM,IAAI,EAE1Db,EAAQ,MAAM,eAAe,gBAAgB,CACjD,EAGA,UAAW,CACTA,EAAQ,MAAM,eAAe,gBAAgB,CAC/C,CACF,CAAC,EAGI5B,GAAa4B,EAAS3B,CAAE,EAC5B,KACCyC,EAAIC,GAASV,EAAM,KAAKU,CAAK,CAAC,EAC9BC,EAAS,IAAMX,EAAM,SAAS,CAAC,EAC/BjB,EAAI2B,GAAUE,EAAA,CAAE,IAAK5C,GAAO0C,EAAQ,CACtC,CACJ,CAAC,EACE,KACCG,GAAYC,EAAc,CAC5B,CACJ,CC7JA,SAASC,GAAS,CAAE,UAAAC,CAAU,EAAsC,CAClE,GAAI,CAACC,EAAQ,iBAAiB,EAC5B,OAAOC,EAAG,EAAK,EAGjB,IAAMC,EAAaH,EAChB,KACCI,EAAI,CAAC,CAAE,OAAQ,CAAE,EAAAC,CAAE,CAAE,IAAMA,CAAC,EAC5BC,GAAY,EAAG,CAAC,EAChBF,EAAI,CAAC,CAACG,EAAGC,CAAC,IAAM,CAACD,EAAIC,EAAGA,CAAC,CAAU,EACnCC,EAAwB,CAAC,CAC3B,EAGIC,EAAUC,EAAc,CAACX,EAAWG,CAAU,CAAC,EAClD,KACCS,EAAO,CAAC,CAAC,CAAE,OAAAC,CAAO,EAAG,CAAC,CAAER,CAAC,CAAC,IAAM,KAAK,IAAIA,EAAIQ,EAAO,CAAC,EAAI,GAAG,EAC5DT,EAAI,CAAC,CAAC,CAAE,CAACU,CAAS,CAAC,IAAMA,CAAS,EAClCC,EAAqB,CACvB,EAGIC,EAAUC,GAAY,QAAQ,EACpC,OAAON,EAAc,CAACX,EAAWgB,CAAO,CAAC,EACtC,KACCZ,EAAI,CAAC,CAAC,CAAE,OAAAS,CAAO,EAAGK,CAAM,IAAML,EAAO,EAAI,KAAO,CAACK,CAAM,EACvDH,EAAqB,EACrBI,EAAUC,GAAUA,EAASV,EAAUR,EAAG,EAAK,CAAC,EAChDmB,EAAU,EAAK,CACjB,CACJ,CAcO,SAASC,GACdC,EAAiBC,EACG,CACpB,OAAOC,EAAM,IAAMd,EAAc,CAC/Be,GAAiBH,CAAE,EACnBxB,GAASyB,CAAO,CAClB,CAAC,CAAC,EACC,KACCpB,EAAI,CAAC,CAAC,CAAE,OAAAuB,CAAO,EAAGC,CAAM,KAAO,CAC7B,OAAAD,EACA,OAAAC,CACF,EAAE,EACFb,EAAqB,CAACR,EAAGC,IACvBD,EAAE,SAAWC,EAAE,QACfD,EAAE,SAAWC,EAAE,MAChB,EACDqB,EAAY,CAAC,CACf,CACJ,CAaO,SAASC,GACdP,EAAiB,CAAE,QAAAQ,EAAS,MAAAC,CAAM,EACO,CACzC,OAAOP,EAAM,IAAM,CACjB,IAAMQ,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EACxDJ,EACG,KACCxB,EAAwB,QAAQ,EAChC6B,GAAkBP,CAAO,CAC3B,EACG,UAAU,CAAC,CAAC,CAAE,OAAAX,CAAO,EAAG,CAAE,OAAAQ,CAAO,CAAC,IAAM,CACvCL,EAAG,UAAU,OAAO,oBAAqBH,GAAU,CAACQ,CAAM,EAC1DL,EAAG,OAASK,CACd,CAAC,EAGL,IAAMW,EAAWC,GAAKC,EAAY,UAAWlB,CAAE,CAAC,EAC7C,KACCX,EAAO,IAAMX,EAAQ,kBAAkB,CAAC,EACxCyC,GAASC,GAASC,GAAaD,CAAK,CAAC,CACvC,EAGF,OAAAX,EAAM,UAAUC,CAAK,EAGdF,EACJ,KACCc,EAAUV,CAAK,EACf/B,EAAI0
C,GAAUC,EAAA,CAAE,IAAKxB,GAAOuB,EAAQ,EACpCE,GAAUT,EAAS,KAAKM,EAAUV,CAAK,CAAC,CAAC,CAC3C,CACJ,CAAC,CACH,CCjIO,SAASc,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACb,CACzB,OAAOC,GAAgBH,EAAI,CAAE,UAAAC,EAAW,QAAAC,CAAQ,CAAC,EAC9C,KACCE,EAAI,CAAC,CAAE,OAAQ,CAAE,EAAAC,CAAE,CAAE,IAAM,CACzB,GAAM,CAAE,OAAAC,CAAO,EAAIC,GAAeP,CAAE,EACpC,MAAO,CACL,OAAQK,GAAKC,CACf,CACF,CAAC,EACDE,EAAwB,QAAQ,CAClC,CACJ,CAaO,SAASC,GACdT,EAAiBU,EACmB,CACpC,OAAOC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClBD,EAAM,UAAU,CAGd,KAAK,CAAE,OAAAE,CAAO,EAAG,CACfd,EAAG,UAAU,OAAO,2BAA4Bc,CAAM,CACxD,EAGA,UAAW,CACTd,EAAG,UAAU,OAAO,0BAA0B,CAChD,CACF,CAAC,EAGD,IAAMe,EAAUC,GAAmB,gBAAgB,EACnD,OAAI,OAAOD,GAAY,YACdE,EAGFlB,GAAiBgB,EAASL,CAAO,EACrC,KACCQ,EAAIC,GAASP,EAAM,KAAKO,CAAK,CAAC,EAC9BC,EAAS,IAAMR,EAAM,SAAS,CAAC,EAC/BR,EAAIe,GAAUE,EAAA,CAAE,IAAKrB,GAAOmB,EAAQ,CACtC,CACJ,CAAC,CACH,CChEO,SAASG,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACpB,CAGlB,IAAMC,EAAUD,EACb,KACCE,EAAI,CAAC,CAAE,OAAAC,CAAO,IAAMA,CAAM,EAC1BC,EAAqB,CACvB,EAGIC,EAAUJ,EACb,KACCK,EAAU,IAAMC,GAAiBT,CAAE,EAChC,KACCI,EAAI,CAAC,CAAE,OAAAC,CAAO,KAAO,CACnB,IAAQL,EAAG,UACX,OAAQA,EAAG,UAAYK,CACzB,EAAE,EACFK,EAAwB,QAAQ,CAClC,CACF,CACF,EAGF,OAAOC,EAAc,CAACR,EAASI,EAASN,CAAS,CAAC,EAC/C,KACCG,EAAI,CAAC,CAACQ,EAAQ,CAAE,IAAAC,EAAK,OAAAC,CAAO,EAAG,CAAE,OAAQ,CAAE,EAAAC,CAAE,EAAG,KAAM,CAAE,OAAAV,CAAO,CAAE,CAAC,KAChEA,EAAS,KAAK,IAAI,EAAGA,EACjB,KAAK,IAAI,EAAGQ,EAASE,EAAIH,CAAM,EAC/B,KAAK,IAAI,EAAGP,EAASU,EAAID,CAAM,CACnC,EACO,CACL,OAAQD,EAAMD,EACd,OAAAP,EACA,OAAQQ,EAAMD,GAAUG,CAC1B,EACD,EACDT,EAAqB,CAACU,EAAGC,IACvBD,EAAE,SAAWC,EAAE,QACfD,EAAE,SAAWC,EAAE,QACfD,EAAE,SAAWC,EAAE,MAChB,CACH,CACJ,CCxCO,SAASC,GACdC,EACqB,CACrB,IAAMC,EAAU,SAAkB,WAAW,GAAK,CAChD,MAAOD,EAAO,UAAUE,GAAS,WAC/BA,EAAM,aAAa,qBAAqB,CAC1C,EAAE,OAAO,CACX,EAGMC,EAAQ,KAAK,IAAI,EAAG,KAAK,IAAIF,EAAQ,MAAOD,EAAO,OAAS,CAAC,CAAC,EACpE,OAAOI,EAAG,GAAGJ,CAAM,EAChB,KACCK,GAASH,GAASI,EAAUJ,EAAO,QAAQ,EAAE,KAAKK,EAAI,IAAML,CAAK,CAAC,CAAC,EACnEM,EAAUR,EAAOG,CAAK,CAAC,EACvBI,EAAIL,IAAU,CACZ,MAAOF,EAAO,QAAQE,CAAK,EAC3B,MAAO,CACL,
MAASA,EAAM,aAAa,qBAAqB,EACjD,OAASA,EAAM,aAAa,sBAAsB,EAClD,QAASA,EAAM,aAAa,uBAAuB,EACnD,OAASA,EAAM,aAAa,sBAAsB,CACpD,CACF,EAAa,EACbO,EAAY,CAAC,CACf,CACJ,CASO,SAASC,GACdC,EACgC,CAChC,IAAMX,EAASY,EAA8B,QAASD,CAAE,EAClDE,EAAOC,EAAE,OAAQ,CAAE,KAAM,aAAc,CAAC,EAC9C,SAAS,KAAK,YAAYD,CAAI,EAG9B,IAAME,EAASD,EAAE,OAAQ,CAAE,KAAM,cAAe,CAAC,EACjD,SAAS,KAAK,YAAYC,CAAM,EAGhC,IAAMC,EAASC,GAAW,+BAA+B,EACzD,OAAOC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAUE,GAAW,CAIzB,GAHA,SAAS,KAAK,aAAa,0BAA2B,EAAE,EAGpDA,EAAQ,MAAM,QAAU,yBAA0B,CACpD,IAAMC,EAAQ,WAAW,+BAA+B,EAClDpB,EAAQ,SAAS,cAAcoB,EAAM,QACvC,wDACA,sDACJ,EAGAD,EAAQ,MAAM,OAAUnB,EAAM,aAAa,sBAAsB,EACjEmB,EAAQ,MAAM,QAAUnB,EAAM,aAAa,uBAAuB,EAClEmB,EAAQ,MAAM,OAAUnB,EAAM,aAAa,sBAAsB,CACnE,CAGA,OAAW,CAACqB,EAAKC,CAAK,IAAK,OAAO,QAAQH,EAAQ,KAAK,EACrD,SAAS,KAAK,aAAa,iBAAiBE,CAAG,GAAIC,CAAK,EAG1D,QAASrB,EAAQ,EAAGA,EAAQH,EAAO,OAAQG,IAAS,CAClD,IAAMsB,EAAQzB,EAAOG,CAAK,EAAE,mBACxBsB,aAAiB,cACnBA,EAAM,OAASJ,EAAQ,QAAUlB,EACrC,CAGA,SAAS,YAAakB,CAAO,CAC/B,CAAC,EAGDf,EAAyBK,EAAI,SAAS,EAAE,KACtCe,EAAOC,GAAMA,EAAG,MAAQ,OAAO,EAC/BC,GAAeT,EAAO,CAACU,EAAGR,IAAYA,CAAO,CAC/C,EACG,UAAU,CAAC,CAAE,MAAAlB,CAAM,IAAM,CACxBA,GAASA,EAAQ,GAAKH,EAAO,OAC7BA,EAAOG,CAAK,EAAE,MAAM,EACpBH,EAAOG,CAAK,EAAE,MAAM,CACtB,CAAC,EAGHgB,EACG,KACCZ,EAAI,IAAM,CACR,IAAMuB,EAASC,GAAoB,QAAQ,EACrCC,EAAS,OAAO,iBAAiBF,CAAM,EAG7C,OAAAf,EAAO,QAAUiB,EAAM,YAGhBA,EAAM,gBAAgB,MAAM,MAAM,EACtC,IAAIR,IAAU,CAACA,GAAO,SAAS,EAAE,EAAE,SAAS,EAAG,GAAG,CAAC,EACnD,KAAK,EAAE,CACZ,CAAC,CACH,EACG,UAAUS,GAASpB,EAAK,QAAU,IAAIoB,CAAK,EAAE,EAGlDd,EAAM,KAAKe,GAAUC,EAAc,CAAC,EACjC,UAAU,IAAM,CACf,SAAS,KAAK,gBAAgB,yBAAyB,CACzD,CAAC,EAGIpC,GAAaC,CAAM,EACvB,KACCoC,EAAUpB,EAAO,KAAKqB,GAAK,CAAC,CAAC,CAAC,EAC9BC,GAAO,EACPC,EAAIC,GAASrB,EAAM,KAAKqB,CAAK,CAAC,EAC9BC,EAAS,IAAMtB,EAAM,SAAS,CAAC,EAC/BZ,EAAIiC,GAAUE,EAAA,CAAE,IAAK/B,GAAO6B,EAAQ,CACtC,CACJ,CAAC,CACH,CChJO,SAASG,GACdC,EAAiB,CAAE,UAAAC,CAAU,EACI,CAGjC,OAAOC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAAC,CAAE,MAAAE,CAAM,IAAM,CAC7BL,EAAG,MAAM,YAAY,
sBAAuB,GAAGK,CAAK,EAAE,CACxD,CAAC,EAGMJ,EACJ,KACCK,EAAID,GAASF,EAAM,KAAK,CAAE,MAAAE,CAAM,CAAC,CAAC,EAClCE,EAAS,IAAMJ,EAAM,SAAS,CAAC,EAC/BK,EAAIH,IAAU,CAAE,IAAKL,EAAI,MAAAK,CAAM,EAAE,CACnC,CACJ,CAAC,CACH,CChEA,IAAAI,GAAwB,SAiCxB,SAASC,GAAQC,EAAyB,CACxCA,EAAG,aAAa,kBAAmB,EAAE,EACrC,IAAMC,EAAOD,EAAG,QAAQ,aAAa,EAC/BE,EAAOD,EACTA,EAAK,aAAa,WAAW,EAC7BD,EAAG,UACP,OAAAA,EAAG,gBAAgB,iBAAiB,EAC7BE,EAAK,QAAQ,CACtB,CAWO,SAASC,GACd,CAAE,OAAAC,CAAO,EACH,CACF,GAAAC,QAAY,YAAY,GAC1B,IAAIC,EAA8BC,GAAc,CAC9C,IAAI,GAAAF,QAAY,iDAAkD,CAChE,KAAML,GACJA,EAAG,aAAa,qBAAqB,GACrCD,GAAQS,EACNR,EAAG,aAAa,uBAAuB,CACzC,CAAC,CAEL,CAAC,EACE,GAAG,UAAWS,GAAMF,EAAW,KAAKE,CAAE,CAAC,CAC5C,CAAC,EACE,KACCC,EAAID,GAAM,CACQA,EAAG,QACX,MAAM,CAChB,CAAC,EACDE,EAAI,IAAMC,GAAY,kBAAkB,CAAC,CAC3C,EACG,UAAUR,CAAM,CAEzB,CCrCA,SAASS,GAAQC,EAAUC,EAAW,CACpC,OAAAD,EAAI,SAAWC,EAAK,SACpBD,EAAI,SAAWC,EAAK,SACbD,CACT,CA2BA,SAASE,GAAQC,EAAoBF,EAAoB,CACvD,IAAMG,EAAmB,IAAI,IAC7B,QAAWC,KAAMC,EAAY,MAAOH,CAAQ,EAAG,CAC7C,IAAMH,EAAMO,EAAW,MAAOF,CAAE,EAG1BG,EAAQ,CAACT,GAAQ,IAAI,IAAIC,EAAI,WAAY,EAAGC,CAAI,CAAC,EACvDG,EAAQ,IAAI,GAAGI,EAAM,CAAC,CAAC,GAAIA,CAAK,EAGhC,QAAWC,KAAQH,EAAY,kBAAmBD,CAAE,EAAG,CACrD,IAAMK,EAAOD,EAAK,aAAa,MAAM,EACjCC,GAAQ,MACVF,EAAM,KAAKT,GAAQ,IAAI,IAAIW,CAAI,EAAGT,CAAI,CAAC,CAC3C,CACF,CAGA,OAAOG,CACT,CAgBO,SAASO,GAAaV,EAAyC,CACpE,OAAOW,GAAW,IAAI,IAAI,cAAeX,CAAI,CAAC,EAC3C,KACCY,EAAIV,GAAYD,GAAQC,EAAU,IAAI,IAAIF,CAAI,CAAC,CAAC,EAChDa,GAAW,IAAMC,EAAG,IAAI,GAAK,CAAC,CAChC,CACJ,CClDA,SAASC,GACPC,EAAgBC,EACC,CACjB,GAAI,EAAED,EAAG,kBAAkB,SACzB,OAAOE,EAIT,IAAMC,EAAKH,EAAG,OAAO,QAAQ,GAAG,EAChC,GAAIG,IAAO,KACT,OAAOD,EAMT,GAAIC,EAAG,QAAUH,EAAG,SAAWA,EAAG,QAChC,OAAOE,EAQT,IAAME,EAAM,IAAI,IAAID,EAAG,IAAI,EAO3B,OANAC,EAAI,OAASA,EAAI,KAAO,GAMnBH,EAAQ,IAAI,GAAGG,CAAG,EAAE,GASzBJ,EAAG,eAAe,EACXK,EAAG,IAAI,IAAIF,EAAG,IAAI,CAAC,GATjBD,CAUX,CASA,SAASI,GAAKC,EAA8C,CAC1D,IAAMC,EAAO,IAAI,IACjB,QAAWL,KAAMM,EAAY,aAAcF,EAAS,IAAI,EACtDC,EAAK,IAAIL,EAAG,UAAWA,CAAE,EAG3B,OAAOK,CACT,CAYA,SAASE,GAAQH,EAA0C,CACzD,QAAWJ,KAAMM,EAAY,gBAAiBF,CAAQ,
EACpD,QAAWI,IAAO,CAAC,OAAQ,KAAK,EAAG,CACjC,IAAMC,EAAQT,EAAG,aAAaQ,CAAG,EACjC,GAAIC,GAAS,CAAC,qBAAqB,KAAKA,CAAK,EAAG,CAE9CT,EAAGQ,CAAG,EAAIR,EAAGQ,CAAG,EAChB,KACF,CACF,CAGF,OAAON,EAAGE,CAAQ,CACpB,CASA,SAASM,GAAOC,EAAsC,CACpD,QAAWC,IAAY,CACrB,+BACA,gCACA,mCACA,+BACA,2BACA,2BACA,GAAGC,EAAQ,wBAAwB,EAC/B,CAAC,0BAA0B,EAC3B,CAAC,CACP,EAAG,CACD,IAAMC,EAASC,GAAmBH,CAAQ,EACpCI,EAASD,GAAmBH,EAAUD,CAAI,EAE9C,OAAOG,GAAW,aAClB,OAAOE,GAAW,aAElBF,EAAO,YAAYE,CAAM,CAE7B,CAGA,IAAMX,EAAOF,GAAK,QAAQ,EAC1B,OAAW,CAACc,EAAMjB,CAAE,IAAKG,GAAKQ,CAAI,EAC5BN,EAAK,IAAIY,CAAI,EACfZ,EAAK,OAAOY,CAAI,EAEhB,SAAS,KAAK,YAAYjB,CAAE,EAGhC,QAAWA,KAAMK,EAAK,OAAO,EAAG,CAC9B,IAAMa,EAAOlB,EAAG,aAAa,MAAM,EAI/BkB,IAAS,eAAiBA,IAAS,gBACrClB,EAAG,OAAO,CACd,CAIA,IAAMmB,EAAYC,GAAoB,WAAW,EACjD,OAAOC,GAAOf,EAAY,SAAUa,CAAS,CAAC,EAC3C,KACCG,EAAUtB,GAAM,CACd,IAAMuB,EAASZ,EAAK,cAAc,QAAQ,EAC1C,GAAIX,EAAG,IAAK,CACV,QAAWkB,KAAQlB,EAAG,kBAAkB,EACtCuB,EAAO,aAAaL,EAAMlB,EAAG,aAAakB,CAAI,CAAE,EAClD,OAAAlB,EAAG,YAAYuB,CAAM,EAGd,IAAIC,EAAWC,GAAY,CAChCF,EAAO,OAAS,IAAME,EAAS,SAAS,CAC1C,CAAC,CAGH,KACE,QAAAF,EAAO,YAAcvB,EAAG,YACxBA,EAAG,YAAYuB,CAAM,EACdxB,CAEX,CAAC,EACD2B,EAAe,EACfC,GAAQ,QAAQ,CAClB,CACJ,CAgBO,SAASC,GACd,CAAE,UAAAC,EAAW,UAAAC,EAAW,UAAAC,CAAU,EACZ,CACtB,IAAMC,EAASC,GAAc,EAC7B,GAAI,SAAS,WAAa,QACxB,OAAOlC,EAIT,IAAMmC,EAAWC,GAAaH,EAAO,IAAI,EAUzC9B,EAAG,QAAQ,EACR,UAAUK,EAAO,EAUpB,IAAM6B,EACJC,EAAsB,SAAS,KAAM,OAAO,EACzC,KACCC,GAAkBJ,CAAQ,EAC1BZ,EAAU,CAAC,CAACzB,EAAIC,CAAO,IAAMF,GAAOC,EAAIC,CAAO,CAAC,EAChDyC,GAAM,CACR,EAIEC,EACJH,EAAyB,OAAQ,UAAU,EACxC,KACCI,EAAIC,EAAW,EACfH,GAAM,CACR,EAMJH,EAAS,KAAKO,GAAeb,CAAS,CAAC,EACpC,UAAU,CAAC,CAAC7B,EAAK,CAAE,OAAA2C,CAAO,CAAC,IAAM,CAChC,QAAQ,aAAaA,EAAQ,EAAE,EAC/B,QAAQ,UAAU,KAAM,GAAI3C,CAAG,CACjC,CAAC,EAMH4C,EAAMT,EAAUI,CAAQ,EACrB,UAAUX,CAAS,EActB,IAAMiB,EACJjB,EAAU,KACRkB,EAAwB,UAAU,EAClCzB,EAAUrB,GAAO+C,GAAY/C,EAAK,CAAE,UAAA8B,CAAU,CAAC,EAC5C,KACCkB,GAAW,KACTC,GAAYjD,EAAK,EAAI,EACdF,EACR,CACH,CACF,EAIAuB,EAAUf,EAAO,EACjBe,EAAUZ,EAAM,EAChB6B,GAAM,CACR,EAUF,OAAAM,EACEC,EAAU,KAAKH,GAAed,
EAAW,CAACsB,EAAGlD,IAAQA,CAAG,CAAC,EASzD6C,EAAU,KACRxB,EAAU,IAAMO,CAAS,EACzBkB,EAAwB,UAAU,EAClCzB,EAAU,IAAMO,CAAS,EACzBkB,EAAwB,MAAM,CAChC,EAQAlB,EAAU,KACRuB,EAAqB,CAACC,EAAGC,IACvBD,EAAE,WAAaC,EAAE,UACjBD,EAAE,OAAaC,EAAE,IAClB,EACDhC,EAAU,IAAMc,CAAQ,EACxBmB,EAAI,IAAM,QAAQ,KAAK,CAAC,CAC1B,CACF,EACG,UAAUtD,GAAO,CA1YtB,IAAAuD,EAAAC,EAgZU,QAAQ,QAAU,MAAQ,CAACxD,EAAI,KACjC,OAAO,SAAS,GAAGwD,GAAAD,EAAA,QAAQ,QAAR,YAAAA,EAAe,IAAf,KAAAC,EAAoB,CAAC,GAExC,QAAQ,kBAAoB,OAC5BC,GAAgBzD,EAAI,IAAI,EACxB,QAAQ,kBAAoB,SAEhC,CAAC,EAMH4B,EAAU,UAAU,IAAM,CACxB,QAAQ,kBAAoB,QAC9B,CAAC,EAMDQ,EAAU,OAAQ,cAAc,EAC7B,UAAU,IAAM,CACf,QAAQ,kBAAoB,MAC9B,CAAC,EAMHP,EAAU,KACRiB,EAAwB,QAAQ,EAChCY,GAAa,GAAG,CAClB,EACG,UAAU,CAAC,CAAE,OAAAf,CAAO,IAAM,CACzB,QAAQ,aAAaA,EAAQ,EAAE,CACjC,CAAC,EAGIE,CACT,CClaA,IAAAc,GAAuB,SAqChB,SAASC,GACdC,EAC0B,CAE1B,IAAMC,EAAQD,EAAO,UAAU,MAAM,GAAG,EAAE,IAAIE,GAC/BA,EAAK,QAAQ,sBAAuB,EAAE,EACvC,SAAW,EAAI,SAAMA,CAClC,EACE,KAAK,GAAG,EAELC,EAAY,IAAI,OAAOF,EAAO,KAAK,EACnCG,EAAY,CAACC,EAAYC,EAAcJ,IACpC,GAAGI,CAAI,2BAA2BJ,CAAI,UAI/C,OAAQK,GAAkB,CACxBA,EAAQA,EACL,QAAQ,gBAAiB,GAAG,EAC5B,KAAK,EAGR,IAAMC,EAAQ,IAAI,OAAO,MAAMR,EAAO,SAAS,MAC7CO,EACG,QAAQ,uBAAwB,MAAM,EACtC,QAAQJ,EAAW,GAAG,CAC3B,IAAK,KAAK,EAGV,OAAOM,MAAS,GAAAC,SAAWD,CAAK,EAC7B,QAAQD,EAAOJ,CAAS,EACxB,QAAQ,8BAA+B,IAAI,CAChD,CACF,CCEO,SAASO,GACdC,EAC+B,CAC/B,OAAOA,EAAQ,OAAS,CAC1B,CASO,SAASC,GACdD,EACgC,CAChC,OAAOA,EAAQ,OAAS,CAC1B,CC1CO,SAASE,GACdC,EAAaC,EACW,CACxB,IAAMC,EAAUC,GAA2BH,CAAG,EAC9C,OAAAI,EACEC,EAAG,SAAS,WAAa,OAAO,EAChCC,GAAY,QAAQ,CACtB,EACG,KACCC,GAAMC,GAAUA,CAAM,EACtBC,EAAU,IAAMR,CAAM,CACxB,EACG,UAAU,CAAC,CAAE,OAAAS,EAAQ,KAAAC,CAAK,IAAMT,EAAQ,KAAK,CAC5C,OACA,KAAM,CACJ,OAAAQ,EACA,KAAAC,EACA,QAAS,CACP,QAASC,EAAQ,gBAAgB,CACnC,CACF,CACF,CAAC,CAAC,EAGCV,CACT,CCxBO,SAASW,GACd,CAAE,UAAAC,CAAU,EACN,CACN,IAAMC,EAASC,GAAc,EACvBC,EAAYC,GAChB,IAAI,IAAI,mBAAoBH,EAAO,IAAI,CACzC,EACG,KACCI,GAAW,IAAMC,CAAK,CACxB,EAGIC,EAAWJ,EACd,KACCK,EAAIC,GAAY,CACd,GAAM,CAAC,CAAEC,CAAO,EAAIT,EAAO,KAAK,MAAM,aAAa,EACnD,OAAOQ,EAAS,KAAK,CAAC,CAAE,QAAAE
,EAAS,QAAAC,CAAQ,IACvCD,IAAYD,GAAWE,EAAQ,SAASF,CAAO,CAChD,GAAKD,EAAS,CAAC,CAClB,CAAC,CACH,EAGFN,EACG,KACCK,EAAIC,GAAY,IAAI,IAAIA,EAAS,IAAIE,GAAW,CAC9C,GAAG,IAAI,IAAI,MAAMA,EAAQ,OAAO,IAAKV,EAAO,IAAI,CAAC,GACjDU,CACF,CAAC,CAAC,CAAC,EACHE,EAAUC,GAAQC,EAAsB,SAAS,KAAM,OAAO,EAC3D,KACCC,EAAOC,GAAM,CAACA,EAAG,SAAW,CAACA,EAAG,OAAO,EACvCC,GAAeX,CAAQ,EACvBM,EAAU,CAAC,CAACI,EAAIP,CAAO,IAAM,CAC3B,GAAIO,EAAG,kBAAkB,QAAS,CAChC,IAAME,EAAKF,EAAG,OAAO,QAAQ,GAAG,EAChC,GAAIE,GAAM,CAACA,EAAG,QAAUL,EAAK,IAAIK,EAAG,IAAI,EAAG,CACzC,IAAMC,EAAMD,EAAG,KAWf,MAAI,CAACF,EAAG,OAAO,QAAQ,aAAa,GAClBH,EAAK,IAAIM,CAAG,IACZV,EACPJ,GAEXW,EAAG,eAAe,EACXI,EAAGD,CAAG,EACf,CACF,CACA,OAAOd,CACT,CAAC,EACDO,EAAUO,GACDE,GAAa,IAAI,IAAIF,CAAG,CAAC,EAC7B,KACCZ,EAAIe,GAAW,CAEb,IAAMC,EADWC,GAAY,EACP,KAAK,QAAQxB,EAAO,KAAMmB,CAAG,EACnD,OAAOG,EAAQ,IAAIC,EAAK,MAAM,GAAG,EAAE,CAAC,CAAC,EACjC,IAAI,IAAIA,CAAI,EACZ,IAAI,IAAIJ,CAAG,CACjB,CAAC,CACH,CACH,CACH,CACF,CACF,EACG,UAAUA,GAAOM,GAAYN,EAAK,EAAI,CAAC,EAG5CO,EAAc,CAACxB,EAAWI,CAAQ,CAAC,EAChC,UAAU,CAAC,CAACE,EAAUC,CAAO,IAAM,CACpBkB,EAAW,mBAAmB,EACtC,YAAYC,GAAsBpB,EAAUC,CAAO,CAAC,CAC5D,CAAC,EAGHV,EAAU,KAAKa,EAAU,IAAMN,CAAQ,CAAC,EACrC,UAAUG,GAAW,CA3J1B,IAAAoB,EA8JM,IAAIC,EAAW,SAAS,aAAc,cAAc,EACpD,GAAIA,IAAa,KAAM,CACrBA,EAAW,GAGX,IAAIC,IAAUF,EAAA7B,EAAO,UAAP,YAAA6B,EAAgB,UAAW,SACpC,MAAM,QAAQE,CAAO,IACxBA,EAAU,CAACA,CAAO,GAGpBC,EAAM,QAAWC,KAAUF,EACzB,QAAWrB,KAAWD,EAAQ,QAAQ,OAAOA,EAAQ,OAAO,EAC1D,GAAI,IAAI,OAAOwB,EAAQ,GAAG,EAAE,KAAKvB,CAAO,EAAG,CACzCoB,EAAW,GACX,MAAME,CACR,CAGJ,SAAS,aAAcF,EAAU,cAAc,CACjD,CAGA,GAAIA,EACF,QAAWI,KAAWC,GAAqB,UAAU,EACnDD,EAAQ,OAAS,EACvB,CAAC,CACL,CCpFO,SAASE,GACdC,EAAsB,CAAE,QAAAC,CAAQ,EACP,CAGzB,GAAM,CAAE,aAAAC,CAAa,EAAIC,GAAY,EACjCD,EAAa,IAAI,GAAG,IACtBE,GAAU,SAAU,EAAI,EAGxBJ,EAAG,MAAQE,EAAa,IAAI,GAAG,EAC/BF,EAAG,MAAM,EAGTK,GAAY,QAAQ,EACjB,KACCC,GAAMC,GAAU,CAACA,CAAM,CACzB,EACG,UAAU,IAAM,CACf,IAAMC,EAAML,GAAY,EACxBK,EAAI,aAAa,OAAO,GAAG,EAC3B,QAAQ,aAAa,CAAC,EAAG,GAAI,GAAGA,CAAG,EAAE,CACvC,CAAC,GAIP,IAAMC,EAASC,GAAkBV,CAAE,EAC7BW,EAASC,EACbX,EAAQ,KAAK
K,GAAMO,EAAoB,CAAC,EACxCC,EAAUd,EAAI,OAAO,EACrBS,CACF,EACG,KACCM,EAAI,IAAMf,EAAG,KAAK,EAClBgB,EAAqB,CACvB,EAGF,OAAOC,EAAc,CAACN,EAAQF,CAAM,CAAC,EAClC,KACCM,EAAI,CAAC,CAACG,EAAOC,CAAK,KAAO,CAAE,MAAAD,EAAO,MAAAC,CAAM,EAAE,EAC1CC,EAAY,CAAC,CACf,CACJ,CAUO,SAASC,GACdrB,EAAsB,CAAE,QAAAC,CAAQ,EACsB,CACtD,IAAMqB,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EAGxDT,EAAc,CACZhB,EAAQ,KAAKK,GAAMO,EAAoB,CAAC,EACxCS,CACF,EAAG,CAACK,EAAGC,IAAUA,CAAK,EACnB,KACCC,EAAwB,OAAO,CACjC,EACG,UAAU,CAAC,CAAE,MAAAX,CAAM,IAAMjB,EAAQ,KAAK,CACrC,OACA,KAAMiB,CACR,CAAC,CAAC,EAGNI,EACG,KACCO,EAAwB,OAAO,CACjC,EACG,UAAU,CAAC,CAAE,MAAAV,CAAM,IAAM,CACpBA,GACFf,GAAU,SAAUe,CAAK,CAC7B,CAAC,EAGLL,EAAUd,EAAG,KAAO,OAAO,EACxB,KACC8B,EAAUN,CAAK,CACjB,EACG,UAAU,IAAMxB,EAAG,MAAM,CAAC,EAM/B,IAAM+B,EAAQC,EAAW,uBAAuB,EAChD,OAAAlB,EAAUiB,EAAO,OAAO,EACrB,UAAU,IAAM/B,EAAG,MAAM,CAAC,EAGtBD,GAAiBC,EAAI,CAAE,QAAAC,CAAQ,CAAC,EACpC,KACCgC,EAAIC,GAASZ,EAAM,KAAKY,CAAK,CAAC,EAC9BC,EAAS,IAAMb,EAAM,SAAS,CAAC,EAC/BP,EAAImB,GAAUE,EAAA,CAAE,IAAKpC,GAAOkC,EAAQ,EACpCd,EAAY,CAAC,CACf,CACJ,CCnHO,SAASiB,GACdC,EAAiB,CAAE,QAAAC,EAAS,OAAAC,CAAO,EACE,CACrC,IAAMC,EAAQ,IAAIC,EACZC,EAAYC,GAAqBN,EAAG,aAAc,EACrD,KACCO,EAAO,OAAO,CAChB,EAGIC,EAAYR,EAAG,cAGfS,EAAOC,EAAW,wBAAyBV,CAAE,EAC7CW,EAAOD,EAAW,uBAAwBV,CAAE,EAGlDY,GAAY,QAAQ,EACjB,UAAUC,GAAUF,EAAK,aACxB,OAAQE,EAAS,OAAS,cAC5B,CAAC,EAGHV,EACG,KACCW,GAAeZ,CAAM,EACrBa,GAAUd,EAAQ,KAAKe,GAAMC,EAAoB,CAAC,CAAC,CACrD,EACG,UAAU,CAAC,CAAC,CAAE,MAAAC,CAAM,EAAG,CAAE,MAAAC,CAAM,CAAC,IAAM,CACrC,OAAQD,EAAM,OAAQ,CAGpB,IAAK,GACHT,EAAK,YAAcU,EAAM,OACrBC,GAAY,oBAAoB,EAChCA,GAAY,2BAA2B,EAC3C,MAGF,IAAK,GACHX,EAAK,YAAcW,GAAY,mBAAmB,EAClD,MAGF,QACE,IAAMC,EAAQC,GAAMJ,EAAM,MAAM,EAChCT,EAAK,YAAcW,GAAY,sBAAuBC,CAAK,CAC/D,CACF,CAAC,EAGL,IAAME,EAAUpB,EACb,KACCqB,EAAI,IAAMb,EAAK,UAAY,EAAE,EAC7Bc,EAAU,CAAC,CAAE,MAAAP,CAAM,IAAMQ,EACvBC,EAAG,GAAGT,EAAM,MAAM,EAAG,EAAE,CAAC,EACxBS,EAAG,GAAGT,EAAM,MAAM,EAAE,CAAC,EAClB,KACCU,GAAY,CAAC,EACbC,GAAQxB,CAAS,EACjBoB,EAAU,CAAC,CAACK,CAAK,IAAMA,CAAK,CAC9B,CACJ,CAAC,EACDC,EA
AIC,EAAsB,EAC1BC,GAAM,CACR,EAGF,OAAAV,EAAQ,UAAUW,GAAQvB,EAAK,YAAYuB,CAAI,CAAC,EAChDX,EACG,KACCY,GAASD,GAAQ,CACf,IAAME,EAAUC,GAAmB,UAAWH,CAAI,EAClD,OAAI,OAAOE,GAAY,YACdE,EAGFC,EAAUH,EAAS,QAAQ,EAC/B,KACCI,EAAUrC,CAAK,EACf4B,EAAI,IAAMK,CAAO,CACnB,CACJ,CAAC,CACH,EACG,UAAUA,GAAW,CAElBA,EAAQ,OAAS,IACjBA,EAAQ,WAAa5B,EAAU,WAE/BA,EAAU,SAAS,CAAE,IAAK4B,EAAQ,SAAU,CAAC,CACjD,CAAC,EAGWnC,EACb,KACCM,EAAOkC,EAAqB,EAC5BV,EAAI,CAAC,CAAE,KAAAW,CAAK,IAAMA,CAAI,CACxB,EAIC,KACClB,EAAImB,GAASxC,EAAM,KAAKwC,CAAK,CAAC,EAC9BC,EAAS,IAAMzC,EAAM,SAAS,CAAC,EAC/B4B,EAAIY,GAAUE,EAAA,CAAE,IAAK7C,GAAO2C,EAAQ,CACtC,CACJ,CCpHO,SAASG,GACdC,EAAkB,CAAE,OAAAC,CAAO,EACF,CACzB,OAAOA,EACJ,KACCC,EAAI,CAAC,CAAE,MAAAC,CAAM,IAAM,CACjB,IAAMC,EAAMC,GAAY,EACxB,OAAAD,EAAI,KAAO,GAGXD,EAAQA,EACL,QAAQ,OAAQ,GAAG,EACnB,QAAQ,KAAM,KAAK,EACnB,QAAQ,KAAM,KAAK,EAGtBC,EAAI,OAAS,KAAKD,CAAK,GAChB,CAAE,IAAAC,CAAI,CACf,CAAC,CACH,CACJ,CAUO,SAASE,GACdC,EAAuBC,EACa,CACpC,IAAMC,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EACxD,OAAAJ,EAAM,UAAU,CAAC,CAAE,IAAAL,CAAI,IAAM,CAC3BG,EAAG,aAAa,sBAAuBA,EAAG,IAAI,EAC9CA,EAAG,KAAO,GAAGH,CAAG,EAClB,CAAC,EAGDU,EAAUP,EAAI,OAAO,EAClB,KACCQ,EAAUJ,CAAK,CACjB,EACG,UAAUK,GAAMA,EAAG,eAAe,CAAC,EAGjCjB,GAAiBQ,EAAIC,CAAO,EAChC,KACCS,EAAIC,GAAST,EAAM,KAAKS,CAAK,CAAC,EAC9BC,EAAS,IAAMV,EAAM,SAAS,CAAC,EAC/BP,EAAIgB,GAAUE,EAAA,CAAE,IAAKb,GAAOW,EAAQ,CACtC,CACJ,CCpDO,SAASG,GACdC,EAAiB,CAAE,QAAAC,EAAS,UAAAC,CAAU,EACA,CACtC,IAAMC,EAAQ,IAAIC,EAGZC,EAASC,GAAoB,cAAc,EAC3CC,EAASC,EACbC,EAAUJ,EAAO,SAAS,EAC1BI,EAAUJ,EAAO,OAAO,CAC1B,EACG,KACCK,GAAUC,EAAc,EACxBC,EAAI,IAAMP,EAAM,KAAK,EACrBQ,EAAqB,CACvB,EAGF,OAAAV,EACG,KACCW,GAAkBP,CAAM,EACxBK,EAAI,CAAC,CAAC,CAAE,QAAAG,CAAQ,EAAGC,CAAK,IAAM,CAC5B,IAAMC,EAAQD,EAAM,MAAM,UAAU,EACpC,GAAID,GAAA,MAAAA,EAAS,QAAUE,EAAMA,EAAM,OAAS,CAAC,EAAG,CAC9C,IAAMC,EAAOH,EAAQA,EAAQ,OAAS,CAAC,EACnCG,EAAK,WAAWD,EAAMA,EAAM,OAAS,CAAC,CAAC,IACzCA,EAAMA,EAAM,OAAS,CAAC,EAAIC,EAC9B,MACED,EAAM,OAAS,EAEjB,OAAOA,CACT,CAAC,CACH,EACG,UAAUA,GAASjB,EAAG,UAAYiB,EAChC,KAAK,EAAE,EACP,QAAQ,MAAO,QAAQ,CA
C1B,EAGJf,EACG,KACCiB,EAAO,CAAC,CAAE,KAAAC,CAAK,IAAMA,IAAS,QAAQ,CACxC,EACG,UAAUC,GAAO,CAChB,OAAQA,EAAI,KAAM,CAGhB,IAAK,aAEDrB,EAAG,UAAU,QACbK,EAAM,iBAAmBA,EAAM,MAAM,SAErCA,EAAM,MAAQL,EAAG,WACnB,KACJ,CACF,CAAC,EAGWC,EACb,KACCkB,EAAOG,EAAqB,EAC5BV,EAAI,CAAC,CAAE,KAAAW,CAAK,IAAMA,CAAI,CACxB,EAIC,KACCC,EAAIC,GAAStB,EAAM,KAAKsB,CAAK,CAAC,EAC9BC,EAAS,IAAMvB,EAAM,SAAS,CAAC,EAC/BS,EAAI,KAAO,CAAE,IAAKZ,CAAG,EAAE,CACzB,CACJ,CCjDO,SAAS2B,GACdC,EAAiB,CAAE,OAAAC,EAAQ,UAAAC,CAAU,EACN,CAC/B,IAAMC,EAASC,GAAc,EAC7B,GAAI,CACF,IAAMC,EAAUC,GAAkBH,EAAO,OAAQF,CAAM,EAGjDM,EAASC,GAAoB,eAAgBR,CAAE,EAC/CS,EAASD,GAAoB,gBAAiBR,CAAE,EAGtDU,EAAwBV,EAAI,OAAO,EAChC,KACCW,EAAO,CAAC,CAAE,OAAAC,CAAO,IACfA,aAAkB,SAAW,CAAC,CAACA,EAAO,QAAQ,GAAG,CAClD,CACH,EACG,UAAU,IAAMC,GAAU,SAAU,EAAK,CAAC,EAG/CX,EACG,KACCS,EAAO,CAAC,CAAE,KAAAG,CAAK,IAAMA,IAAS,QAAQ,CACxC,EACG,UAAUC,GAAO,CAChB,IAAMC,EAASC,GAAiB,EAChC,OAAQF,EAAI,KAAM,CAGhB,IAAK,QACH,GAAIC,IAAWT,EAAO,CACpB,IAAMW,EAAU,IAAI,IACpB,QAAWC,KAAUC,EACnB,sBAAuBX,CACzB,EAAG,CACD,IAAMY,EAAUF,EAAO,kBACvBD,EAAQ,IAAIC,EAAQ,WAClBE,EAAQ,aAAa,eAAe,CACtC,CAAC,CACH,CAGA,GAAIH,EAAQ,KAAM,CAChB,GAAM,CAAC,CAACI,CAAI,CAAC,EAAI,CAAC,GAAGJ,CAAO,EAAE,KAAK,CAAC,CAAC,CAAEK,CAAC,EAAG,CAAC,CAAEC,CAAC,IAAMA,EAAID,CAAC,EAC1DD,EAAK,MAAM,CACb,CAGAP,EAAI,MAAM,CACZ,CACA,MAGF,IAAK,SACL,IAAK,MACHF,GAAU,SAAU,EAAK,EACzBN,EAAM,KAAK,EACX,MAGF,IAAK,UACL,IAAK,YACH,GAAI,OAAOS,GAAW,YACpBT,EAAM,MAAM,MACP,CACL,IAAMkB,EAAM,CAAClB,EAAO,GAAGa,EACrB,wDACAX,CACF,CAAC,EACKiB,EAAI,KAAK,IAAI,GACjB,KAAK,IAAI,EAAGD,EAAI,QAAQT,CAAM,CAAC,EAAIS,EAAI,QACrCV,EAAI,OAAS,UAAY,GAAK,IAE9BU,EAAI,MAAM,EACdA,EAAIC,CAAC,EAAE,MAAM,CACf,CAGAX,EAAI,MAAM,EACV,MAGF,QACMR,IAAUU,GAAiB,GAC7BV,EAAM,MAAM,CAClB,CACF,CAAC,EAGLL,EACG,KACCS,EAAO,CAAC,CAAE,KAAAG,CAAK,IAAMA,IAAS,QAAQ,CACxC,EACG,UAAUC,GAAO,CAChB,OAAQA,EAAI,KAAM,CAGhB,IAAK,IACL,IAAK,IACL,IAAK,IACHR,EAAM,MAAM,EACZA,EAAM,OAAO,EAGbQ,EAAI,MAAM,EACV,KACJ,CACF,CAAC,EAGL,IAAMY,EAASC,GAAiBrB,EAAO,CAAE,QAAAF,CAAQ,CAAC,EAClD,OAAOwB,EACLF,EACAG,GAAkBrB,EAAQ,CAAE,QAAAJ,EAAS,OAAAsB,CAAO,CAA
C,CAC/C,EACG,KACCI,GAGE,GAAGC,GAAqB,eAAgBhC,CAAE,EACvC,IAAIiC,GAASC,GAAiBD,EAAO,CAAE,OAAAN,CAAO,CAAC,CAAC,EAGnD,GAAGK,GAAqB,iBAAkBhC,CAAE,EACzC,IAAIiC,GAASE,GAAmBF,EAAO,CAAE,QAAA5B,EAAS,UAAAH,CAAU,CAAC,CAAC,CACnE,CACF,CAGJ,OAASkC,EAAK,CACZ,OAAApC,EAAG,OAAS,GACLqC,EACT,CACF,CCnKO,SAASC,GACdC,EAAiB,CAAE,OAAAC,EAAQ,UAAAC,CAAU,EACG,CACxC,OAAOC,EAAc,CACnBF,EACAC,EACG,KACCE,EAAUC,GAAY,CAAC,EACvBC,EAAOC,GAAO,CAAC,CAACA,EAAI,aAAa,IAAI,GAAG,CAAC,CAC3C,CACJ,CAAC,EACE,KACCC,EAAI,CAAC,CAACC,EAAOF,CAAG,IAAMG,GAAuBD,EAAM,MAAM,EACvDF,EAAI,aAAa,IAAI,GAAG,CAC1B,CAAC,EACDC,EAAIG,GAAM,CA1FhB,IAAAC,EA2FQ,IAAMC,EAAQ,IAAI,IAGZC,EAAK,SAAS,mBAAmBd,EAAI,WAAW,SAAS,EAC/D,QAASe,EAAOD,EAAG,SAAS,EAAGC,EAAMA,EAAOD,EAAG,SAAS,EACtD,IAAIF,EAAAG,EAAK,gBAAL,MAAAH,EAAoB,aAAc,CACpC,IAAMI,EAAWD,EAAK,YAChBE,EAAWN,EAAGK,CAAQ,EACxBC,EAAS,OAASD,EAAS,QAC7BH,EAAM,IAAIE,EAAmBE,CAAQ,CACzC,CAIF,OAAW,CAACF,EAAMG,CAAI,IAAKL,EAAO,CAChC,GAAM,CAAE,WAAAM,CAAW,EAAIC,EAAE,OAAQ,KAAMF,CAAI,EAC3CH,EAAK,YAAY,GAAG,MAAM,KAAKI,CAAU,CAAC,CAC5C,CAGA,MAAO,CAAE,IAAKnB,EAAI,MAAAa,CAAM,CAC1B,CAAC,CACH,CACJ,CCPO,SAASQ,GACdC,EAAiB,CAAE,UAAAC,EAAW,MAAAC,CAAM,EACf,CACrB,IAAMC,EAASH,EAAG,QAAqB,UAAU,EAC3CI,EACJD,EAAO,UACPA,EAAO,cAAe,UAGxB,OAAOE,EAAc,CAACH,EAAOD,CAAS,CAAC,EACpC,KACCK,EAAI,CAAC,CAAC,CAAE,OAAAC,EAAQ,OAAAC,CAAO,EAAG,CAAE,OAAQ,CAAE,EAAAC,CAAE,CAAE,CAAC,KACzCD,EAASA,EACL,KAAK,IAAIJ,EAAQ,KAAK,IAAI,EAAGK,EAAIF,CAAM,CAAC,EACxCH,EACG,CACL,OAAAI,EACA,OAAQC,GAAKF,EAASH,CACxB,EACD,EACDM,EAAqB,CAACC,EAAGC,IACvBD,EAAE,SAAWC,EAAE,QACfD,EAAE,SAAWC,EAAE,MAChB,CACH,CACJ,CAuBO,SAASC,GACdb,EAAiBc,EACe,CADf,IAAAC,EAAAD,EAAE,SAAAE,CA5JrB,EA4JmBD,EAAcE,EAAAC,GAAdH,EAAc,CAAZ,YAEnB,IAAMI,EAAQC,EAAW,0BAA2BpB,CAAE,EAChD,CAAE,EAAAS,CAAE,EAAIY,GAAiBF,CAAK,EACpC,OAAOG,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EAClDC,EAAQL,EACX,KACCM,GAAU,EAAGC,EAAuB,CACtC,EAGF,OAAAF,EAAM,KAAKG,GAAef,CAAO,CAAC,EAC/B,UAAU,CAGT,KAAK,CAAC,CAAE,OAAAR,CAAO,EAAG,CAAE,OAAQD,CAAO,CAAC,EAAG,CACrCY,EAAM,MAAM,OAAS,GAAGX,EAAS,EAAIC,CAAC,KACtCT
,EAAG,MAAM,IAAY,GAAGO,CAAM,IAChC,EAGA,UAAW,CACTY,EAAM,MAAM,OAAS,GACrBnB,EAAG,MAAM,IAAY,EACvB,CACF,CAAC,EAGH4B,EAAM,KAAKI,GAAM,CAAC,EACf,UAAU,IAAM,CACf,QAAWC,KAAQC,EAAY,8BAA+BlC,CAAE,EAAG,CACjE,GAAI,CAACiC,EAAK,aACR,SACF,IAAME,EAAYF,EAAK,QAAqB,yBAAyB,EACrE,GAAI,OAAOE,GAAc,YAAa,CACpC,IAAM5B,EAAS0B,EAAK,UAAYE,EAAU,UACpC,CAAE,OAAA3B,CAAO,EAAI4B,GAAeD,CAAS,EAC3CA,EAAU,SAAS,CACjB,IAAK5B,EAASC,EAAS,CACzB,CAAC,CACH,CACF,CACF,CAAC,EAGH6B,GAAKH,EAA8B,kBAAmBlC,CAAE,CAAC,EACtD,KACCsC,GAASC,GAASC,EAAUD,EAAO,OAAO,EACvC,KACCE,GAAUC,EAAc,EACxBpC,EAAI,IAAMiC,CAAK,EACfI,EAAUlB,CAAK,CACjB,CACF,CACF,EACG,UAAUc,GAAS,CAClB,IAAMK,EAAQxB,EAA6B,QAAQmB,EAAM,OAAO,IAAI,EACxDnB,EAAW,qBAAqBmB,EAAM,EAAE,IAAI,EACpD,aAAa,gBAAiB,GAAGK,EAAM,OAAO,EAAE,CACtD,CAAC,EAGE7C,GAAaC,EAAIiB,CAAO,EAC5B,KACC4B,EAAIC,GAASvB,EAAM,KAAKuB,CAAK,CAAC,EAC9BC,EAAS,IAAMxB,EAAM,SAAS,CAAC,EAC/BjB,EAAIwC,GAAUE,EAAA,CAAE,IAAKhD,GAAO8C,EAAQ,CACtC,CACJ,CAAC,CACH,CCxKO,SAASG,GACdC,EAAcC,EACW,CACzB,GAAI,OAAOA,GAAS,YAAa,CAC/B,IAAMC,EAAM,gCAAgCF,CAAI,IAAIC,CAAI,GACxD,OAAOE,GAGLC,GAAqB,GAAGF,CAAG,kBAAkB,EAC1C,KACCG,GAAW,IAAMC,CAAK,EACtBC,EAAIC,IAAY,CACd,QAASA,EAAQ,QACnB,EAAE,EACFC,GAAe,CAAC,CAAC,CACnB,EAGFL,GAAkBF,CAAG,EAClB,KACCG,GAAW,IAAMC,CAAK,EACtBC,EAAIG,IAAS,CACX,MAAOA,EAAK,iBACZ,MAAOA,EAAK,WACd,EAAE,EACFD,GAAe,CAAC,CAAC,CACnB,CACJ,EACG,KACCF,EAAI,CAAC,CAACC,EAASE,CAAI,IAAOC,IAAA,GAAKH,GAAYE,EAAO,CACpD,CAGJ,KAAO,CACL,IAAMR,EAAM,gCAAgCF,CAAI,GAChD,OAAOI,GAAkBF,CAAG,EACzB,KACCK,EAAIG,IAAS,CACX,aAAcA,EAAK,YACrB,EAAE,EACFD,GAAe,CAAC,CAAC,CACnB,CACJ,CACF,CCvDO,SAASG,GACdC,EAAcC,EACW,CACzB,IAAMC,EAAM,WAAWF,CAAI,oBAAoB,mBAAmBC,CAAO,CAAC,GAC1E,OAAOE,GAA2BD,CAAG,EAClC,KACCE,GAAW,IAAMC,CAAK,EACtBC,EAAI,CAAC,CAAE,WAAAC,EAAY,YAAAC,CAAY,KAAO,CACpC,MAAOD,EACP,MAAOC,CACT,EAAE,EACFC,GAAe,CAAC,CAAC,CACnB,CACJ,CCOO,SAASC,GACdC,EACyB,CAGzB,IAAIC,EAAQD,EAAI,MAAM,qCAAqC,EAC3D,GAAIC,EAAO,CACT,GAAM,CAAC,CAAEC,EAAMC,CAAI,EAAIF,EACvB,OAAOG,GAA2BF,EAAMC,CAAI,CAC9C,CAIA,GADAF,EAAQD,EAAI,MAAM,oCAAoC,EAClDC,EAAO,CACT,GAAM,CAAC,CAAEI,EAAMC,CAAI,EAAIL,EACv
B,OAAOM,GAA2BF,EAAMC,CAAI,CAC9C,CAGA,OAAOE,CACT,CCpBA,IAAIC,GAgBG,SAASC,GACdC,EACoB,CACpB,OAAOF,QAAWG,EAAM,IAAM,CAC5B,IAAMC,EAAS,SAAsB,WAAY,cAAc,EAC/D,GAAIA,EACF,OAAOC,EAAGD,CAAM,EAKhB,GADYE,GAAqB,SAAS,EAClC,OAAQ,CACd,IAAMC,EAAU,SAA0B,WAAW,EACrD,GAAI,EAAEA,GAAWA,EAAQ,QACvB,OAAOC,CACX,CAGA,OAAOC,GAAiBP,EAAG,IAAI,EAC5B,KACCQ,EAAIC,GAAS,SAAS,WAAYA,EAAO,cAAc,CAAC,CAC1D,CAEN,CAAC,EACE,KACCC,GAAW,IAAMJ,CAAK,EACtBK,EAAOF,GAAS,OAAO,KAAKA,CAAK,EAAE,OAAS,CAAC,EAC7CG,EAAIH,IAAU,CAAE,MAAAA,CAAM,EAAE,EACxBI,EAAY,CAAC,CACf,EACJ,CASO,SAASC,GACdd,EAC+B,CAC/B,IAAMe,EAAQC,EAAW,uBAAwBhB,CAAE,EACnD,OAAOC,EAAM,IAAM,CACjB,IAAMgB,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAAC,CAAE,MAAAR,CAAM,IAAM,CAC7BM,EAAM,YAAYI,GAAkBV,CAAK,CAAC,EAC1CM,EAAM,UAAU,IAAI,+BAA+B,CACrD,CAAC,EAGMhB,GAAYC,CAAE,EAClB,KACCQ,EAAIY,GAASH,EAAM,KAAKG,CAAK,CAAC,EAC9BC,EAAS,IAAMJ,EAAM,SAAS,CAAC,EAC/BL,EAAIQ,GAAUE,EAAA,CAAE,IAAKtB,GAAOoB,EAAQ,CACtC,CACJ,CAAC,CACH,CCtDO,SAASG,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACpB,CAClB,OAAOC,GAAiB,SAAS,IAAI,EAClC,KACCC,EAAU,IAAMC,GAAgBL,EAAI,CAAE,QAAAE,EAAS,UAAAD,CAAU,CAAC,CAAC,EAC3DK,EAAI,CAAC,CAAE,OAAQ,CAAE,EAAAC,CAAE,CAAE,KACZ,CACL,OAAQA,GAAK,EACf,EACD,EACDC,EAAwB,QAAQ,CAClC,CACJ,CAaO,SAASC,GACdT,EAAiBU,EACY,CAC7B,OAAOC,EAAM,IAAM,CACjB,IAAMC,EAAQ,IAAIC,EAClB,OAAAD,EAAM,UAAU,CAGd,KAAK,CAAE,OAAAE,CAAO,EAAG,CACfd,EAAG,OAASc,CACd,EAGA,UAAW,CACTd,EAAG,OAAS,EACd,CACF,CAAC,GAICe,EAAQ,wBAAwB,EAC5BC,EAAG,CAAE,OAAQ,EAAM,CAAC,EACpBjB,GAAUC,EAAIU,CAAO,GAExB,KACCO,EAAIC,GAASN,EAAM,KAAKM,CAAK,CAAC,EAC9BC,EAAS,IAAMP,EAAM,SAAS,CAAC,EAC/BN,EAAIY,GAAUE,EAAA,CAAE,IAAKpB,GAAOkB,EAAQ,CACtC,CACJ,CAAC,CACH,CCfO,SAASG,GACdC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACT,CAC7B,IAAMC,EAAQ,IAAI,IAGZC,EAAUC,EAA+B,gBAAiBL,CAAE,EAClE,QAAWM,KAAUF,EAAS,CAC5B,IAAMG,EAAK,mBAAmBD,EAAO,KAAK,UAAU,CAAC,CAAC,EAChDE,EAASC,GAAmB,QAAQF,CAAE,IAAI,EAC5C,OAAOC,GAAW,aACpBL,EAAM,IAAIG,EAAQE,CAAM,CAC5B,CAGA,IAAME,EAAUR,EACb,KACCS,EAAwB,QAAQ,EAChCC,EAAI,CAAC,CAAE,OAAAC,CAAO,IAAM,CAClB,IAAMC,EAAOC,GAAoB,MAAM,EACjCC,EAAOC,EAAW,wBAAyBH,CAAI,EACrD
,OAAOD,EAAS,IACdG,EAAK,UACLF,EAAK,UAET,CAAC,EACDI,GAAM,CACR,EAqFF,OAlFmBC,GAAiB,SAAS,IAAI,EAC9C,KACCR,EAAwB,QAAQ,EAGhCS,EAAUC,GAAQC,EAAM,IAAM,CAC5B,IAAIC,EAA4B,CAAC,EACjC,OAAOC,EAAG,CAAC,GAAGrB,CAAK,EAAE,OAAO,CAACsB,EAAO,CAACnB,EAAQE,CAAM,IAAM,CACvD,KAAOe,EAAK,QACGpB,EAAM,IAAIoB,EAAKA,EAAK,OAAS,CAAC,CAAC,EACnC,SAAWf,EAAO,SACzBe,EAAK,IAAI,EAOb,IAAIG,EAASlB,EAAO,UACpB,KAAO,CAACkB,GAAUlB,EAAO,eACvBA,EAASA,EAAO,cAChBkB,EAASlB,EAAO,UAIlB,IAAImB,EAASnB,EAAO,aACpB,KAAOmB,EAAQA,EAASA,EAAO,aAC7BD,GAAUC,EAAO,UAGnB,OAAOF,EAAM,IACX,CAAC,GAAGF,EAAO,CAAC,GAAGA,EAAMjB,CAAM,CAAC,EAAE,QAAQ,EACtCoB,CACF,CACF,EAAG,IAAI,GAAkC,CAAC,CAC5C,CAAC,EACE,KAGCd,EAAIa,GAAS,IAAI,IAAI,CAAC,GAAGA,CAAK,EAAE,KAAK,CAAC,CAAC,CAAEG,CAAC,EAAG,CAAC,CAAEC,CAAC,IAAMD,EAAIC,CAAC,CAAC,CAAC,EAC9DC,GAAkBpB,CAAO,EAGzBU,EAAU,CAAC,CAACK,EAAOM,CAAM,IAAM9B,EAC5B,KACC+B,GAAK,CAAC,CAACC,EAAMC,CAAI,EAAG,CAAE,OAAQ,CAAE,EAAAC,CAAE,EAAG,KAAAC,CAAK,IAAM,CAC9C,IAAMC,EAAOF,EAAIC,EAAK,QAAU,KAAK,MAAMf,EAAK,MAAM,EAGtD,KAAOa,EAAK,QAAQ,CAClB,GAAM,CAAC,CAAER,CAAM,EAAIQ,EAAK,CAAC,EACzB,GAAIR,EAASK,EAASI,GAAKE,EACzBJ,EAAO,CAAC,GAAGA,EAAMC,EAAK,MAAM,CAAE,MAE9B,MAEJ,CAGA,KAAOD,EAAK,QAAQ,CAClB,GAAM,CAAC,CAAEP,CAAM,EAAIO,EAAKA,EAAK,OAAS,CAAC,EACvC,GAAIP,EAASK,GAAUI,GAAK,CAACE,EAC3BH,EAAO,CAACD,EAAK,IAAI,EAAI,GAAGC,CAAI,MAE5B,MAEJ,CAGA,MAAO,CAACD,EAAMC,CAAI,CACpB,EAAG,CAAC,CAAC,EAAG,CAAC,GAAGT,CAAK,CAAC,CAAC,EACnBa,EAAqB,CAACV,EAAGC,IACvBD,EAAE,CAAC,IAAMC,EAAE,CAAC,GACZD,EAAE,CAAC,IAAMC,EAAE,CAAC,CACb,CACH,CACF,CACF,CACF,CACF,EAIC,KACCjB,EAAI,CAAC,CAACqB,EAAMC,CAAI,KAAO,CACrB,KAAMD,EAAK,IAAI,CAAC,CAACV,CAAI,IAAMA,CAAI,EAC/B,KAAMW,EAAK,IAAI,CAAC,CAACX,CAAI,IAAMA,CAAI,CACjC,EAAE,EAGFgB,EAAU,CAAE,KAAM,CAAC,EAAG,KAAM,CAAC,CAAE,CAAC,EAChCC,GAAY,EAAG,CAAC,EAChB5B,EAAI,CAAC,CAACgB,EAAGC,CAAC,IAGJD,EAAE,KAAK,OAASC,EAAE,KAAK,OAClB,CACL,KAAMA,EAAE,KAAK,MAAM,KAAK,IAAI,EAAGD,EAAE,KAAK,OAAS,CAAC,EAAGC,EAAE,KAAK,MAAM,EAChE,KAAM,CAAC,CACT,EAIO,CACL,KAAMA,EAAE,KAAK,MAAM,EAAE,EACrB,KAAMA,EAAE,KAAK,MAAM,EAAGA,EAAE,KAAK,OAASD,EAAE,KAAK,MAAM,CACrD,CAEH,CACH
,CACJ,CAYO,SAASa,GACdzC,EAAiB,CAAE,UAAAC,EAAW,QAAAC,EAAS,MAAAwC,EAAO,QAAAC,CAAQ,EACd,CACxC,OAAOrB,EAAM,IAAM,CACjB,IAAMsB,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGC,GAAQ,EAAI,CAAC,EAoBxD,GAnBAJ,EAAM,UAAU,CAAC,CAAE,KAAAX,EAAM,KAAAC,CAAK,IAAM,CAGlC,OAAW,CAAC5B,CAAM,IAAK4B,EACrB5B,EAAO,UAAU,OAAO,sBAAsB,EAC9CA,EAAO,UAAU,OAAO,sBAAsB,EAIhD,OAAW,CAACmB,EAAO,CAACnB,CAAM,CAAC,IAAK2B,EAAK,QAAQ,EAC3C3B,EAAO,UAAU,IAAI,sBAAsB,EAC3CA,EAAO,UAAU,OACf,uBACAmB,IAAUQ,EAAK,OAAS,CAC1B,CAEJ,CAAC,EAGGgB,EAAQ,YAAY,EAAG,CAGzB,IAAMC,EAAUC,EACdlD,EAAU,KAAKmD,GAAa,CAAC,EAAGxC,EAAI,IAAG,EAAY,CAAC,EACpDX,EAAU,KAAKmD,GAAa,GAAG,EAAGxC,EAAI,IAAM,QAAiB,CAAC,CAChE,EAGAgC,EACG,KACCS,EAAO,CAAC,CAAE,KAAApB,CAAK,IAAMA,EAAK,OAAS,CAAC,EACpCH,GAAkBY,EAAM,KAAKY,GAAUC,EAAc,CAAC,CAAC,EACvDC,GAAeN,CAAO,CACxB,EACG,UAAU,CAAC,CAAC,CAAC,CAAE,KAAAjB,CAAK,CAAC,EAAGwB,CAAQ,IAAM,CACrC,GAAM,CAACnD,CAAM,EAAI2B,EAAKA,EAAK,OAAS,CAAC,EACrC,GAAI3B,EAAO,aAAc,CAGvB,IAAMoD,EAAYC,GAAoBrD,CAAM,EAC5C,GAAI,OAAOoD,GAAc,YAAa,CACpC,IAAMhC,EAASpB,EAAO,UAAYoD,EAAU,UACtC,CAAE,OAAA7C,CAAO,EAAI+C,GAAeF,CAAS,EAC3CA,EAAU,SAAS,CACjB,IAAKhC,EAASb,EAAS,EACvB,SAAA4C,CACF,CAAC,CACH,CACF,CACF,CAAC,CACP,CAGA,OAAIR,EAAQ,qBAAqB,GAC/BhD,EACG,KACC4D,EAAUf,CAAK,EACfnC,EAAwB,QAAQ,EAChCyC,GAAa,GAAG,EAChBU,GAAK,CAAC,EACND,EAAUlB,EAAQ,KAAKmB,GAAK,CAAC,CAAC,CAAC,EAC/BC,GAAO,CAAE,MAAO,GAAI,CAAC,EACrBP,GAAeZ,CAAK,CACtB,EACG,UAAU,CAAC,CAAC,CAAE,CAAE,KAAAX,CAAK,CAAC,IAAM,CAC3B,IAAM+B,EAAMC,GAAY,EAGlB3D,EAAS2B,EAAKA,EAAK,OAAS,CAAC,EACnC,GAAI3B,GAAUA,EAAO,OAAQ,CAC3B,GAAM,CAAC4D,CAAM,EAAI5D,EACX,CAAE,KAAA6D,CAAK,EAAI,IAAI,IAAID,EAAO,IAAI,EAChCF,EAAI,OAASG,IACfH,EAAI,KAAOG,EACX,QAAQ,aAAa,CAAC,EAAG,GAAI,GAAGH,CAAG,EAAE,EAIzC,MACEA,EAAI,KAAO,GACX,QAAQ,aAAa,CAAC,EAAG,GAAI,GAAGA,CAAG,EAAE,CAEzC,CAAC,EAGAjE,GAAqBC,EAAI,CAAE,UAAAC,EAAW,QAAAC,CAAQ,CAAC,EACnD,KACCkE,EAAIC,GAASzB,EAAM,KAAKyB,CAAK,CAAC,EAC9BC,EAAS,IAAM1B,EAAM,SAAS,CAAC,EAC/BhC,EAAIyD,GAAUE,EAAA,CAAE,IAAKvE,GAAOqE,EAAQ,CACtC,CACJ,CAAC,CACH,CC9RO,SAASG,GACdC,EAAkB,CAAE,UAAAC,EAAW,MAAAC,EAAO,QAAAC,CAAQ,EACvB,CA
GvB,IAAMC,EAAaH,EAChB,KACCI,EAAI,CAAC,CAAE,OAAQ,CAAE,EAAAC,CAAE,CAAE,IAAMA,CAAC,EAC5BC,GAAY,EAAG,CAAC,EAChBF,EAAI,CAAC,CAAC,EAAGG,CAAC,IAAM,EAAIA,GAAKA,EAAI,CAAC,EAC9BC,EAAqB,CACvB,EAGIC,EAAUR,EACb,KACCG,EAAI,CAAC,CAAE,OAAAM,CAAO,IAAMA,CAAM,CAC5B,EAGF,OAAOC,EAAc,CAACF,EAASN,CAAU,CAAC,EACvC,KACCC,EAAI,CAAC,CAACM,EAAQE,CAAS,IAAM,EAAEF,GAAUE,EAAU,EACnDJ,EAAqB,EACrBK,EAAUX,EAAQ,KAAKY,GAAK,CAAC,CAAC,CAAC,EAC/BC,GAAQ,EAAI,EACZC,GAAO,CAAE,MAAO,GAAI,CAAC,EACrBZ,EAAIa,IAAW,CAAE,OAAAA,CAAO,EAAE,CAC5B,CACJ,CAYO,SAASC,GACdC,EAAiB,CAAE,UAAAnB,EAAW,QAAAoB,EAAS,MAAAnB,EAAO,QAAAC,CAAQ,EACpB,CAClC,IAAMmB,EAAQ,IAAIC,EACZC,EAAQF,EAAM,KAAKG,EAAe,EAAGT,GAAQ,EAAI,CAAC,EACxD,OAAAM,EAAM,UAAU,CAGd,KAAK,CAAE,OAAAJ,CAAO,EAAG,CACfE,EAAG,OAASF,EACRA,GACFE,EAAG,aAAa,WAAY,IAAI,EAChCA,EAAG,KAAK,GAERA,EAAG,gBAAgB,UAAU,CAEjC,EAGA,UAAW,CACTA,EAAG,MAAM,IAAM,GACfA,EAAG,OAAS,GACZA,EAAG,gBAAgB,UAAU,CAC/B,CACF,CAAC,EAGDC,EACG,KACCP,EAAUU,CAAK,EACfE,EAAwB,QAAQ,CAClC,EACG,UAAU,CAAC,CAAE,OAAAC,CAAO,IAAM,CACzBP,EAAG,MAAM,IAAM,GAAGO,EAAS,EAAE,IAC/B,CAAC,EAGLC,EAAUR,EAAI,OAAO,EAClB,UAAUS,GAAM,CACfA,EAAG,eAAe,EAClB,OAAO,SAAS,CAAE,IAAK,CAAE,CAAC,CAC5B,CAAC,EAGI9B,GAAeqB,EAAI,CAAE,UAAAnB,EAAW,MAAAC,EAAO,QAAAC,CAAQ,CAAC,EACpD,KACC2B,EAAIC,GAAST,EAAM,KAAKS,CAAK,CAAC,EAC9BC,EAAS,IAAMV,EAAM,SAAS,CAAC,EAC/BjB,EAAI0B,GAAUE,EAAA,CAAE,IAAKb,GAAOW,EAAQ,CACtC,CACJ,CClHO,SAASG,GACd,CAAE,UAAAC,EAAW,UAAAC,CAAU,EACjB,CACND,EACG,KACCE,EAAU,IAAMC,EAAY,cAAc,CAAC,EAC3CC,GAASC,GAAMC,GAAuBD,CAAE,EACrC,KACCE,EAAUP,EAAU,KAAKQ,GAAK,CAAC,CAAC,CAAC,EACjCC,EAAOC,GAAWA,CAAO,EACzBC,EAAI,IAAMN,CAAE,EACZO,GAAK,CAAC,CACR,CACF,EACAH,EAAOJ,GAAMA,EAAG,YAAcA,EAAG,WAAW,EAC5CD,GAASC,GAAM,CACb,IAAMQ,EAAOR,EAAG,UACVS,EAAOT,EAAG,QAAQ,GAAG,GAAKA,EAIhC,OAHAS,EAAK,MAAQD,EAGRE,EAAQ,kBAAkB,EAIxBC,GAAoBF,EAAM,CAAE,UAAAb,CAAU,CAAC,EAC3C,KACCM,EAAUP,EAAU,KAAKQ,GAAK,CAAC,CAAC,CAAC,EACjCS,EAAS,IAAMH,EAAK,gBAAgB,OAAO,CAAC,CAC9C,EAPOI,CAQX,CAAC,CACH,EACG,UAAU,EAGXH,EAAQ,kBAAkB,GAC5Bf,EACG,KACCE,EAAU,IAAMC,EAAY,YAAY,CAAC,EACzCC,GAASC,GAAMW,GAAoBX,EAAI,CAAE,UAAAJ,CAA
U,CAAC,CAAC,CACvD,EACG,UAAU,CACnB,CCpDO,SAASkB,GACd,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACf,CACND,EACG,KACCE,EAAU,IAAMC,EACd,2BACF,CAAC,EACDC,EAAIC,GAAM,CACRA,EAAG,cAAgB,GACnBA,EAAG,QAAU,EACf,CAAC,EACDC,GAASD,GAAME,EAAUF,EAAI,QAAQ,EAClC,KACCG,GAAU,IAAMH,EAAG,UAAU,SAAS,0BAA0B,CAAC,EACjEI,EAAI,IAAMJ,CAAE,CACd,CACF,EACAK,GAAeT,CAAO,CACxB,EACG,UAAU,CAAC,CAACI,EAAIM,CAAM,IAAM,CAC3BN,EAAG,UAAU,OAAO,0BAA0B,EAC1CM,IACFN,EAAG,QAAU,GACjB,CAAC,CACP,CC9BA,SAASO,IAAyB,CAChC,MAAO,qBAAqB,KAAK,UAAU,SAAS,CACtD,CAiBO,SAASC,GACd,CAAE,UAAAC,CAAU,EACN,CACNA,EACG,KACCC,EAAU,IAAMC,EAAY,qBAAqB,CAAC,EAClDC,EAAIC,GAAMA,EAAG,gBAAgB,mBAAmB,CAAC,EACjDC,EAAOP,EAAa,EACpBQ,GAASF,GAAMG,EAAUH,EAAI,YAAY,EACtC,KACCI,EAAI,IAAMJ,CAAE,CACd,CACF,CACF,EACG,UAAUA,GAAM,CACf,IAAMK,EAAML,EAAG,UAGXK,IAAQ,EACVL,EAAG,UAAY,EAGNK,EAAML,EAAG,eAAiBA,EAAG,eACtCA,EAAG,UAAYK,EAAM,EAEzB,CAAC,CACP,CCpCO,SAASC,GACd,CAAE,UAAAC,EAAW,QAAAC,CAAQ,EACf,CACNC,EAAc,CAACC,GAAY,QAAQ,EAAGF,CAAO,CAAC,EAC3C,KACCG,EAAI,CAAC,CAACC,EAAQC,CAAM,IAAMD,GAAU,CAACC,CAAM,EAC3CC,EAAUF,GAAUG,EAAGH,CAAM,EAC1B,KACCI,GAAMJ,EAAS,IAAM,GAAG,CAC1B,CACF,EACAK,GAAeV,CAAS,CAC1B,EACG,UAAU,CAAC,CAACK,EAAQ,CAAE,OAAQ,CAAE,EAAAM,CAAE,CAAC,CAAC,IAAM,CACzC,GAAIN,EACF,SAAS,KAAK,aAAa,qBAAsB,EAAE,EACnD,SAAS,KAAK,MAAM,IAAM,IAAIM,CAAC,SAC1B,CACL,IAAMC,EAAQ,GAAK,SAAS,SAAS,KAAK,MAAM,IAAK,EAAE,EACvD,SAAS,KAAK,gBAAgB,oBAAoB,EAClD,SAAS,KAAK,MAAM,IAAM,GACtBA,GACF,OAAO,SAAS,EAAGA,CAAK,CAC5B,CACF,CAAC,CACP,CC7DK,OAAO,UACV,OAAO,QAAU,SAAUC,EAAa,CACtC,IAAMC,EAA2B,CAAC,EAClC,QAAWC,KAAO,OAAO,KAAKF,CAAG,EAE/BC,EAAK,KAAK,CAACC,EAAKF,EAAIE,CAAG,CAAC,CAAC,EAG3B,OAAOD,CACT,GAGG,OAAO,SACV,OAAO,OAAS,SAAUD,EAAa,CACrC,IAAMC,EAAiB,CAAC,EACxB,QAAWC,KAAO,OAAO,KAAKF,CAAG,EAE/BC,EAAK,KAAKD,EAAIE,CAAG,CAAC,EAGpB,OAAOD,CACT,GAKE,OAAO,SAAY,cAGhB,QAAQ,UAAU,WACrB,QAAQ,UAAU,SAAW,SAC3BE,EAA8BC,EACxB,CACF,OAAOD,GAAM,UACf,KAAK,WAAaA,EAAE,KACpB,KAAK,UAAYA,EAAE,MAEnB,KAAK,WAAaA,EAClB,KAAK,UAAYC,EAErB,GAGG,QAAQ,UAAU,cACrB,QAAQ,UAAU,YAAc,YAC3BC,EACG,CACN,IAAMC,EAAS,KAAK,WACpB,GAAIA,EAAQ,CACND,EAAM,SAAW,GACnBC,EAAO,Y
AAY,IAAI,EAGzB,QAASC,EAAIF,EAAM,OAAS,EAAGE,GAAK,EAAGA,IAAK,CAC1C,IAAIC,EAAOH,EAAME,CAAC,EACd,OAAOC,GAAS,SAClBA,EAAO,SAAS,eAAeA,CAAI,EAC5BA,EAAK,YACZA,EAAK,WAAW,YAAYA,CAAI,EAG7BD,EAGHD,EAAO,aAAa,KAAK,gBAAkBE,CAAI,EAF/CF,EAAO,aAAaE,EAAM,IAAI,CAGlC,CACF,CACF,I1MMJ,SAASC,IAA4C,CACnD,OAAI,SAAS,WAAa,QACjBC,GACL,GAAG,IAAI,IAAI,yBAA0BC,GAAO,IAAI,CAAC,EACnD,EACG,KAECC,EAAI,IAAM,OAAO,EACjBC,EAAY,CAAC,CACf,EAEKC,GACL,IAAI,IAAI,2BAA4BH,GAAO,IAAI,CACjD,CAEJ,CAOA,SAAS,gBAAgB,UAAU,OAAO,OAAO,EACjD,SAAS,gBAAgB,UAAU,IAAI,IAAI,EAG3C,IAAMI,GAAYC,GAAc,EAC1BC,GAAYC,GAAc,EAC1BC,GAAYC,GAAoBH,EAAS,EACzCI,GAAYC,GAAc,EAG1BC,GAAYC,GAAc,EAC1BC,GAAYC,GAAW,oBAAoB,EAC3CC,GAAYD,GAAW,qBAAqB,EAC5CE,GAAYC,GAAW,EAGvBlB,GAASmB,GAAc,EACvBC,GAAS,SAAS,MAAM,UAAU,QAAQ,EAC5CtB,GAAiB,EACjBuB,GAGEC,GAAS,IAAIC,EACnBC,GAAiB,CAAE,OAAAF,EAAO,CAAC,EAG3B,IAAMG,GAAY,IAAIF,EAGlBG,EAAQ,oBAAoB,GAC9BC,GAAuB,CAAE,UAAArB,GAAW,UAAAM,GAAW,UAAAa,EAAU,CAAC,EACvD,UAAUrB,EAAS,EAzJxB,IAAAwB,KA4JIA,GAAA5B,GAAO,UAAP,YAAA4B,GAAgB,YAAa,QAC/BC,GAAqB,CAAE,UAAAzB,EAAU,CAAC,EAGpC0B,EAAMxB,GAAWE,EAAO,EACrB,KACCuB,GAAM,GAAG,CACX,EACG,UAAU,IAAM,CACfC,GAAU,SAAU,EAAK,EACzBA,GAAU,SAAU,EAAK,CAC3B,CAAC,EAGLtB,GACG,KACCuB,EAAO,CAAC,CAAE,KAAAC,CAAK,IAAMA,IAAS,QAAQ,CACxC,EACG,UAAUC,GAAO,CAChB,OAAQA,EAAI,KAAM,CAGhB,IAAK,IACL,IAAK,IACH,IAAMC,EAAOC,GAAoC,gBAAgB,EAC7D,OAAOD,GAAS,aAClBE,GAAYF,CAAI,EAClB,MAGF,IAAK,IACL,IAAK,IACH,IAAMG,EAAOF,GAAoC,gBAAgB,EAC7D,OAAOE,GAAS,aAClBD,GAAYC,CAAI,EAClB,MAGF,IAAK,QACH,IAAMC,EAASC,GAAiB,EAC5BD,aAAkB,kBACpBA,EAAO,MAAM,CACnB,CACF,CAAC,EAGLE,GAAc,CAAE,UAAA9B,GAAW,UAAAR,EAAU,CAAC,EACtCuC,GAAmB,CAAE,UAAAvC,GAAW,QAAAU,EAAQ,CAAC,EACzC8B,GAAe,CAAE,UAAAxC,EAAU,CAAC,EAC5ByC,GAAgB,CAAE,UAAAjC,GAAW,QAAAE,EAAQ,CAAC,EAGtC,IAAMgC,GAAUC,GAAYC,GAAoB,QAAQ,EAAG,CAAE,UAAApC,EAAU,CAAC,EAClEqC,GAAQ7C,GACX,KACCH,EAAI,IAAM+C,GAAoB,MAAM,CAAC,EACrCE,EAAUC,GAAMC,GAAUD,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,EAAQ,CAAC,CAAC,EACrD5C,EAAY,CAAC,CACf,EAGImD,GAAWvB,EAGf,GAAGwB,GAAqB,SAAS,EAC9B,IAAIH,GAAMI,GAAaJ,EAAI,CAAE,QAAA3C,EAAQ,CAAC,CAAC,EAG1C,GAAG8C,GAAq
B,QAAQ,EAC7B,IAAIH,GAAMK,GAAYL,EAAI,CAAE,OAAA7B,EAAO,CAAC,CAAC,EAGxC,GAAGgC,GAAqB,QAAQ,EAC7B,IAAIH,GAAMM,GAAYN,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,GAAS,MAAAG,EAAM,CAAC,CAAC,EAG3D,GAAGK,GAAqB,SAAS,EAC9B,IAAIH,GAAMO,GAAaP,CAAE,CAAC,EAG7B,GAAGG,GAAqB,UAAU,EAC/B,IAAIH,GAAMQ,GAAcR,EAAI,CAAE,UAAA1B,EAAU,CAAC,CAAC,EAG7C,GAAG6B,GAAqB,QAAQ,EAC7B,IAAIH,GAAMS,GAAYT,EAAI,CAAE,OAAA/B,GAAQ,UAAAV,EAAU,CAAC,CAAC,EAGnD,GAAG4C,GAAqB,QAAQ,EAC7B,IAAIH,GAAMU,GAAYV,CAAE,CAAC,CAC9B,EAGMW,GAAWC,EAAM,IAAMjC,EAG3B,GAAGwB,GAAqB,UAAU,EAC/B,IAAIH,GAAMa,GAAcb,CAAE,CAAC,EAG9B,GAAGG,GAAqB,SAAS,EAC9B,IAAIH,GAAMc,GAAad,EAAI,CAAE,UAAAvC,GAAW,QAAAJ,GAAS,OAAAS,EAAO,CAAC,CAAC,EAG7D,GAAGqC,GAAqB,SAAS,EAC9B,IAAIH,GAAMzB,EAAQ,kBAAkB,EACjCwC,GAAoBf,EAAI,CAAE,OAAA/B,GAAQ,UAAAd,EAAU,CAAC,EAC7C6D,CACJ,EAGF,GAAGb,GAAqB,cAAc,EACnC,IAAIH,GAAMiB,GAAiBjB,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,EAAQ,CAAC,CAAC,EAGzD,GAAGQ,GAAqB,SAAS,EAC9B,IAAIH,GAAMA,EAAG,aAAa,cAAc,IAAM,aAC3CkB,GAAGrD,GAAS,IAAMsD,GAAanB,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,GAAS,MAAAG,EAAM,CAAC,CAAC,EACjEoB,GAAGvD,GAAS,IAAMwD,GAAanB,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,GAAS,MAAAG,EAAM,CAAC,CAAC,CACrE,EAGF,GAAGK,GAAqB,MAAM,EAC3B,IAAIH,GAAMoB,GAAUpB,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,EAAQ,CAAC,CAAC,EAGlD,GAAGQ,GAAqB,KAAK,EAC1B,IAAIH,GAAMqB,GAAqBrB,EAAI,CAClC,UAAAvC,GAAW,QAAAkC,GAAS,MAAAG,GAAO,QAAAzC,EAC7B,CAAC,CAAC,EAGJ,GAAG8C,GAAqB,KAAK,EAC1B,IAAIH,GAAMsB,GAAetB,EAAI,CAAE,UAAAvC,GAAW,QAAAkC,GAAS,MAAAG,GAAO,QAAAzC,EAAQ,CAAC,CAAC,CACzE,CAAC,EAGKkE,GAAatE,GAChB,KACC8C,EAAU,IAAMY,EAAQ,EACxBa,GAAUtB,EAAQ,EAClBnD,EAAY,CAAC,CACf,EAGFwE,GAAW,UAAU,EAMrB,OAAO,UAAatE,GACpB,OAAO,UAAaE,GACpB,OAAO,QAAaE,GACpB,OAAO,UAAaE,GACpB,OAAO,UAAaE,GACpB,OAAO,QAAaE,GACpB,OAAO,QAAaE,GACpB,OAAO,OAAaC,GACpB,OAAO,OAAaK,GACpB,OAAO,UAAaG,GACpB,OAAO,WAAaiD",
+  "names": ["require_focus_visible", "__commonJSMin", "exports", "module", "global", "factory", "applyFocusVisiblePolyfill", "scope", "hadKeyboardEvent", "hadFocusVisibleRecently", "hadFocusVisibleRecentlyTimeout", "inputTypesAllowlist", "isValidFocusTarget", "el", "focusTriggersKeyboardModality", "type", "tagName", "addFocusVisibleClass", "removeFocusVisibleClass", "onKeyDown", "e", "onPointerDown", "onFocus", "onBlur", "onVisibilityChange", "addInitialPointerMoveListeners", "onInitialPointerMove", "removeInitialPointerMoveListeners", "event", "error", "require_clipboard", "__commonJSMin", "exports", "module", "root", "factory", "__webpack_modules__", "__unused_webpack_module", "__webpack_exports__", "__webpack_require__", "clipboard", "tiny_emitter", "tiny_emitter_default", "listen", "listen_default", "src_select", "select_default", "command", "type", "err", "ClipboardActionCut", "target", "selectedText", "actions_cut", "createFakeElement", "value", "isRTL", "fakeElement", "yPosition", "fakeCopyAction", "options", "ClipboardActionCopy", "actions_copy", "_typeof", "obj", "ClipboardActionDefault", "_options$action", "action", "container", "text", "actions_default", "clipboard_typeof", "_classCallCheck", "instance", "Constructor", "_defineProperties", "props", "i", "descriptor", "_createClass", "protoProps", "staticProps", "_inherits", "subClass", "superClass", "_setPrototypeOf", "o", "p", "_createSuper", "Derived", "hasNativeReflectConstruct", "_isNativeReflectConstruct", "Super", "_getPrototypeOf", "result", "NewTarget", "_possibleConstructorReturn", "self", "call", "_assertThisInitialized", "e", "getAttributeValue", "suffix", "element", "attribute", "Clipboard", "_Emitter", "_super", "trigger", "_this", "_this2", "selector", "actions", "support", "DOCUMENT_NODE_TYPE", "proto", "closest", "__unused_webpack_exports", "_delegate", "callback", "useCapture", "listenerFn", "listener", "delegate", "elements", "is", "listenNode", "listenNodeList", "listenSelector", 
"node", "nodeList", "select", "isReadOnly", "selection", "range", "E", "name", "ctx", "data", "evtArr", "len", "evts", "liveEvents", "__webpack_module_cache__", "moduleId", "getter", "definition", "key", "prop", "require_escape_html", "__commonJSMin", "exports", "module", "matchHtmlRegExp", "escapeHtml", "string", "str", "match", "escape", "html", "index", "lastIndex", "import_focus_visible", "extendStatics", "d", "b", "p", "__extends", "__", "__awaiter", "thisArg", "_arguments", "P", "generator", "adopt", "value", "resolve", "reject", "fulfilled", "step", "e", "rejected", "result", "__generator", "body", "_", "t", "f", "y", "g", "verb", "n", "v", "op", "__values", "o", "s", "m", "i", "__read", "n", "r", "ar", "e", "error", "__spreadArray", "to", "from", "pack", "i", "l", "ar", "__await", "v", "__asyncGenerator", "thisArg", "_arguments", "generator", "g", "q", "verb", "n", "a", "b", "resume", "step", "e", "settle", "r", "fulfill", "reject", "value", "f", "__asyncValues", "o", "m", "i", "__values", "verb", "n", "v", "resolve", "reject", "settle", "d", "isFunction", "value", "createErrorClass", "createImpl", "_super", "instance", "ctorFunc", "UnsubscriptionError", "createErrorClass", "_super", "errors", "err", "i", "arrRemove", "arr", "item", "index", "Subscription", "initialTeardown", "errors", "_parentage", "_parentage_1", "__values", "_parentage_1_1", "parent_1", "initialFinalizer", "isFunction", "e", "UnsubscriptionError", "_finalizers", "_finalizers_1", "_finalizers_1_1", "finalizer", "execFinalizer", "err", "__spreadArray", "__read", "teardown", "_a", "parent", "arrRemove", "empty", "EMPTY_SUBSCRIPTION", "Subscription", "isSubscription", "value", "isFunction", "execFinalizer", "finalizer", "config", "timeoutProvider", "handler", "timeout", "args", "_i", "delegate", "__spreadArray", "__read", "handle", "reportUnhandledError", "err", "timeoutProvider", "onUnhandledError", "config", "noop", "COMPLETE_NOTIFICATION", "createNotification", "errorNotification", 
"error", "nextNotification", "value", "kind", "context", "errorContext", "cb", "config", "isRoot", "_a", "errorThrown", "error", "captureError", "err", "Subscriber", "_super", "__extends", "destination", "_this", "isSubscription", "EMPTY_OBSERVER", "next", "error", "complete", "SafeSubscriber", "value", "handleStoppedNotification", "nextNotification", "err", "errorNotification", "COMPLETE_NOTIFICATION", "Subscription", "_bind", "bind", "fn", "thisArg", "ConsumerObserver", "partialObserver", "value", "error", "handleUnhandledError", "err", "SafeSubscriber", "_super", "__extends", "observerOrNext", "complete", "_this", "isFunction", "context_1", "config", "Subscriber", "handleUnhandledError", "error", "config", "captureError", "reportUnhandledError", "defaultErrorHandler", "err", "handleStoppedNotification", "notification", "subscriber", "onStoppedNotification", "timeoutProvider", "EMPTY_OBSERVER", "noop", "observable", "identity", "x", "pipe", "fns", "_i", "pipeFromArray", "identity", "input", "prev", "fn", "Observable", "subscribe", "operator", "observable", "observerOrNext", "error", "complete", "_this", "subscriber", "isSubscriber", "SafeSubscriber", "errorContext", "_a", "source", "sink", "err", "next", "promiseCtor", "getPromiseCtor", "resolve", "reject", "value", "operations", "_i", "pipeFromArray", "x", "getPromiseCtor", "promiseCtor", "_a", "config", "isObserver", "value", "isFunction", "isSubscriber", "Subscriber", "isSubscription", "hasLift", "source", "isFunction", "operate", "init", "liftedSource", "err", "createOperatorSubscriber", "destination", "onNext", "onComplete", "onError", "onFinalize", "OperatorSubscriber", "_super", "__extends", "shouldUnsubscribe", "_this", "value", "err", "closed_1", "_a", "Subscriber", "animationFrameProvider", "callback", "request", "cancel", "delegate", "handle", "timestamp", "Subscription", "args", "_i", "__spreadArray", "__read", "ObjectUnsubscribedError", "createErrorClass", "_super", "Subject", "_super", "__extends", 
"_this", "operator", "subject", "AnonymousSubject", "ObjectUnsubscribedError", "value", "errorContext", "_b", "__values", "_c", "observer", "err", "observers", "_a", "subscriber", "hasError", "isStopped", "EMPTY_SUBSCRIPTION", "Subscription", "arrRemove", "thrownError", "observable", "Observable", "destination", "source", "AnonymousSubject", "_super", "__extends", "destination", "source", "_this", "value", "_b", "_a", "err", "subscriber", "EMPTY_SUBSCRIPTION", "Subject", "BehaviorSubject", "_super", "__extends", "_value", "_this", "subscriber", "subscription", "_a", "hasError", "thrownError", "value", "Subject", "dateTimestampProvider", "ReplaySubject", "_super", "__extends", "_bufferSize", "_windowTime", "_timestampProvider", "dateTimestampProvider", "_this", "value", "_a", "isStopped", "_buffer", "_infiniteTimeWindow", "subscriber", "subscription", "copy", "i", "adjustedBufferSize", "now", "last", "Subject", "Action", "_super", "__extends", "scheduler", "work", "state", "delay", "Subscription", "intervalProvider", "handler", "timeout", "args", "_i", "delegate", "__spreadArray", "__read", "handle", "AsyncAction", "_super", "__extends", "scheduler", "work", "_this", "state", "delay", "id", "_a", "_id", "intervalProvider", "_scheduler", "error", "_delay", "errored", "errorValue", "e", "actions", "arrRemove", "Action", "Scheduler", "schedulerActionCtor", "now", "work", "delay", "state", "dateTimestampProvider", "AsyncScheduler", "_super", "__extends", "SchedulerAction", "now", "Scheduler", "_this", "action", "actions", "error", "asyncScheduler", "AsyncScheduler", "AsyncAction", "async", "QueueAction", "_super", "__extends", "scheduler", "work", "_this", "state", "delay", "id", "AsyncAction", "QueueScheduler", "_super", "__extends", "AsyncScheduler", "queueScheduler", "QueueScheduler", "QueueAction", "AnimationFrameAction", "_super", "__extends", "scheduler", "work", "_this", "id", "delay", "animationFrameProvider", "actions", "_a", "AsyncAction", 
"AnimationFrameScheduler", "_super", "__extends", "action", "flushId", "actions", "error", "AsyncScheduler", "animationFrameScheduler", "AnimationFrameScheduler", "AnimationFrameAction", "EMPTY", "Observable", "subscriber", "isScheduler", "value", "isFunction", "last", "arr", "popResultSelector", "args", "isFunction", "popScheduler", "isScheduler", "popNumber", "defaultValue", "isArrayLike", "x", "isPromise", "value", "isFunction", "isInteropObservable", "input", "isFunction", "observable", "isAsyncIterable", "obj", "isFunction", "createInvalidObservableTypeError", "input", "getSymbolIterator", "iterator", "isIterable", "input", "isFunction", "iterator", "readableStreamLikeToAsyncGenerator", "readableStream", "reader", "__await", "_a", "_b", "value", "done", "isReadableStreamLike", "obj", "isFunction", "innerFrom", "input", "Observable", "isInteropObservable", "fromInteropObservable", "isArrayLike", "fromArrayLike", "isPromise", "fromPromise", "isAsyncIterable", "fromAsyncIterable", "isIterable", "fromIterable", "isReadableStreamLike", "fromReadableStreamLike", "createInvalidObservableTypeError", "obj", "subscriber", "obs", "observable", "isFunction", "array", "i", "promise", "value", "err", "reportUnhandledError", "iterable", "iterable_1", "__values", "iterable_1_1", "asyncIterable", "process", "readableStream", "readableStreamLikeToAsyncGenerator", "asyncIterable_1", "__asyncValues", "asyncIterable_1_1", "executeSchedule", "parentSubscription", "scheduler", "work", "delay", "repeat", "scheduleSubscription", "observeOn", "scheduler", "delay", "operate", "source", "subscriber", "createOperatorSubscriber", "value", "executeSchedule", "err", "subscribeOn", "scheduler", "delay", "operate", "source", "subscriber", "scheduleObservable", "input", "scheduler", "innerFrom", "subscribeOn", "observeOn", "schedulePromise", "input", "scheduler", "innerFrom", "subscribeOn", "observeOn", "scheduleArray", "input", "scheduler", "Observable", "subscriber", "i", "scheduleIterable", 
"input", "scheduler", "Observable", "subscriber", "iterator", "executeSchedule", "value", "done", "_a", "err", "isFunction", "scheduleAsyncIterable", "input", "scheduler", "Observable", "subscriber", "executeSchedule", "iterator", "result", "scheduleReadableStreamLike", "input", "scheduler", "scheduleAsyncIterable", "readableStreamLikeToAsyncGenerator", "scheduled", "input", "scheduler", "isInteropObservable", "scheduleObservable", "isArrayLike", "scheduleArray", "isPromise", "schedulePromise", "isAsyncIterable", "scheduleAsyncIterable", "isIterable", "scheduleIterable", "isReadableStreamLike", "scheduleReadableStreamLike", "createInvalidObservableTypeError", "from", "input", "scheduler", "scheduled", "innerFrom", "of", "args", "_i", "scheduler", "popScheduler", "from", "throwError", "errorOrErrorFactory", "scheduler", "errorFactory", "isFunction", "init", "subscriber", "Observable", "EmptyError", "createErrorClass", "_super", "isValidDate", "value", "map", "project", "thisArg", "operate", "source", "subscriber", "index", "createOperatorSubscriber", "value", "isArray", "callOrApply", "fn", "args", "__spreadArray", "__read", "mapOneOrManyArgs", "map", "isArray", "getPrototypeOf", "objectProto", "getKeys", "argsArgArrayOrObject", "args", "first_1", "isPOJO", "keys", "key", "obj", "createObject", "keys", "values", "result", "key", "i", "combineLatest", "args", "_i", "scheduler", "popScheduler", "resultSelector", "popResultSelector", "_a", "argsArgArrayOrObject", "observables", "keys", "from", "result", "Observable", "combineLatestInit", "values", "createObject", "identity", "mapOneOrManyArgs", "valueTransform", "subscriber", "maybeSchedule", "length", "active", "remainingFirstValues", "i", "source", "hasFirstValue", "createOperatorSubscriber", "value", "execute", "subscription", "executeSchedule", "mergeInternals", "source", "subscriber", "project", "concurrent", "onBeforeNext", "expand", "innerSubScheduler", "additionalFinalizer", "buffer", "active", "index", 
"isComplete", "checkComplete", "outerNext", "value", "doInnerSub", "innerComplete", "innerFrom", "createOperatorSubscriber", "innerValue", "bufferedValue", "executeSchedule", "err", "mergeMap", "project", "resultSelector", "concurrent", "isFunction", "a", "i", "map", "b", "ii", "innerFrom", "operate", "source", "subscriber", "mergeInternals", "mergeAll", "concurrent", "mergeMap", "identity", "concatAll", "mergeAll", "concat", "args", "_i", "concatAll", "from", "popScheduler", "defer", "observableFactory", "Observable", "subscriber", "innerFrom", "nodeEventEmitterMethods", "eventTargetMethods", "jqueryMethods", "fromEvent", "target", "eventName", "options", "resultSelector", "isFunction", "mapOneOrManyArgs", "_a", "__read", "isEventTarget", "methodName", "handler", "isNodeStyleEventEmitter", "toCommonHandlerRegistry", "isJQueryStyleEventEmitter", "add", "remove", "isArrayLike", "mergeMap", "subTarget", "innerFrom", "Observable", "subscriber", "args", "_i", "fromEventPattern", "addHandler", "removeHandler", "resultSelector", "mapOneOrManyArgs", "Observable", "subscriber", "handler", "e", "_i", "retValue", "isFunction", "timer", "dueTime", "intervalOrScheduler", "scheduler", "async", "intervalDuration", "isScheduler", "Observable", "subscriber", "due", "isValidDate", "n", "merge", "args", "_i", "scheduler", "popScheduler", "concurrent", "popNumber", "sources", "innerFrom", "mergeAll", "from", "EMPTY", "NEVER", "Observable", "noop", "isArray", "argsOrArgArray", "args", "filter", "predicate", "thisArg", "operate", "source", "subscriber", "index", "createOperatorSubscriber", "value", "zip", "args", "_i", "resultSelector", "popResultSelector", "sources", "argsOrArgArray", "Observable", "subscriber", "buffers", "completed", "sourceIndex", "innerFrom", "createOperatorSubscriber", "value", "buffer", "result", "__spreadArray", "__read", "i", "EMPTY", "audit", "durationSelector", "operate", "source", "subscriber", "hasValue", "lastValue", "durationSubscriber", "isComplete", 
"endDuration", "value", "cleanupDuration", "createOperatorSubscriber", "innerFrom", "auditTime", "duration", "scheduler", "asyncScheduler", "audit", "timer", "bufferCount", "bufferSize", "startBufferEvery", "operate", "source", "subscriber", "buffers", "count", "createOperatorSubscriber", "value", "toEmit", "buffers_1", "__values", "buffers_1_1", "buffer", "toEmit_1", "toEmit_1_1", "arrRemove", "buffers_2", "buffers_2_1", "catchError", "selector", "operate", "source", "subscriber", "innerSub", "syncUnsub", "handledResult", "createOperatorSubscriber", "err", "innerFrom", "scanInternals", "accumulator", "seed", "hasSeed", "emitOnNext", "emitBeforeComplete", "source", "subscriber", "hasState", "state", "index", "createOperatorSubscriber", "value", "i", "combineLatest", "args", "_i", "resultSelector", "popResultSelector", "pipe", "__spreadArray", "__read", "mapOneOrManyArgs", "operate", "source", "subscriber", "combineLatestInit", "argsOrArgArray", "combineLatestWith", "otherSources", "_i", "combineLatest", "__spreadArray", "__read", "debounce", "durationSelector", "operate", "source", "subscriber", "hasValue", "lastValue", "durationSubscriber", "emit", "value", "createOperatorSubscriber", "noop", "innerFrom", "debounceTime", "dueTime", "scheduler", "asyncScheduler", "operate", "source", "subscriber", "activeTask", "lastValue", "lastTime", "emit", "value", "emitWhenIdle", "targetTime", "now", "createOperatorSubscriber", "defaultIfEmpty", "defaultValue", "operate", "source", "subscriber", "hasValue", "createOperatorSubscriber", "value", "take", "count", "EMPTY", "operate", "source", "subscriber", "seen", "createOperatorSubscriber", "value", "ignoreElements", "operate", "source", "subscriber", "createOperatorSubscriber", "noop", "mapTo", "value", "map", "delayWhen", "delayDurationSelector", "subscriptionDelay", "source", "concat", "take", "ignoreElements", "mergeMap", "value", "index", "innerFrom", "mapTo", "delay", "due", "scheduler", "asyncScheduler", "duration", 
"timer", "delayWhen", "distinctUntilChanged", "comparator", "keySelector", "identity", "defaultCompare", "operate", "source", "subscriber", "previousKey", "first", "createOperatorSubscriber", "value", "currentKey", "a", "b", "distinctUntilKeyChanged", "key", "compare", "distinctUntilChanged", "x", "y", "throwIfEmpty", "errorFactory", "defaultErrorFactory", "operate", "source", "subscriber", "hasValue", "createOperatorSubscriber", "value", "EmptyError", "endWith", "values", "_i", "source", "concat", "of", "__spreadArray", "__read", "finalize", "callback", "operate", "source", "subscriber", "first", "predicate", "defaultValue", "hasDefaultValue", "source", "filter", "v", "identity", "take", "defaultIfEmpty", "throwIfEmpty", "EmptyError", "takeLast", "count", "EMPTY", "operate", "source", "subscriber", "buffer", "createOperatorSubscriber", "value", "buffer_1", "__values", "buffer_1_1", "merge", "args", "_i", "scheduler", "popScheduler", "concurrent", "popNumber", "argsOrArgArray", "operate", "source", "subscriber", "mergeAll", "from", "__spreadArray", "__read", "mergeWith", "otherSources", "_i", "merge", "__spreadArray", "__read", "repeat", "countOrConfig", "count", "delay", "_a", "EMPTY", "operate", "source", "subscriber", "soFar", "sourceSub", "resubscribe", "notifier", "timer", "innerFrom", "notifierSubscriber_1", "createOperatorSubscriber", "subscribeToSource", "syncUnsub", "scan", "accumulator", "seed", "operate", "scanInternals", "share", "options", "_a", "connector", "Subject", "_b", "resetOnError", "_c", "resetOnComplete", "_d", "resetOnRefCountZero", "wrapperSource", "connection", "resetConnection", "subject", "refCount", "hasCompleted", "hasErrored", "cancelReset", "reset", "resetAndUnsubscribe", "conn", "operate", "source", "subscriber", "dest", "handleReset", "SafeSubscriber", "value", "err", "innerFrom", "on", "args", "_i", "onSubscriber", "__spreadArray", "__read", "shareReplay", "configOrBufferSize", "windowTime", "scheduler", "bufferSize", "refCount", 
"_a", "_b", "_c", "share", "ReplaySubject", "skip", "count", "filter", "_", "index", "skipUntil", "notifier", "operate", "source", "subscriber", "taking", "skipSubscriber", "createOperatorSubscriber", "noop", "innerFrom", "value", "startWith", "values", "_i", "scheduler", "popScheduler", "operate", "source", "subscriber", "concat", "switchMap", "project", "resultSelector", "operate", "source", "subscriber", "innerSubscriber", "index", "isComplete", "checkComplete", "createOperatorSubscriber", "value", "innerIndex", "outerIndex", "innerFrom", "innerValue", "takeUntil", "notifier", "operate", "source", "subscriber", "innerFrom", "createOperatorSubscriber", "noop", "takeWhile", "predicate", "inclusive", "operate", "source", "subscriber", "index", "createOperatorSubscriber", "value", "result", "tap", "observerOrNext", "error", "complete", "tapObserver", "isFunction", "operate", "source", "subscriber", "_a", "isUnsub", "createOperatorSubscriber", "value", "err", "_b", "identity", "throttle", "durationSelector", "config", "operate", "source", "subscriber", "_a", "_b", "leading", "_c", "trailing", "hasValue", "sendValue", "throttled", "isComplete", "endThrottling", "send", "cleanupThrottling", "startThrottle", "value", "innerFrom", "createOperatorSubscriber", "throttleTime", "duration", "scheduler", "config", "asyncScheduler", "duration$", "timer", "throttle", "withLatestFrom", "inputs", "_i", "project", "popResultSelector", "operate", "source", "subscriber", "len", "otherValues", "hasValue", "ready", "i", "innerFrom", "createOperatorSubscriber", "value", "identity", "noop", "values", "__spreadArray", "__read", "zip", "sources", "_i", "operate", "source", "subscriber", "__spreadArray", "__read", "zipWith", "otherInputs", "_i", "zip", "__spreadArray", "__read", "watchDocument", "document$", "ReplaySubject", "fromEvent", "getElements", "selector", "node", "getElement", "el", "getOptionalElement", "getActiveElement", "_a", "_b", "_c", "_d", "observer$", "merge", "fromEvent", 
"debounceTime", "startWith", "map", "getActiveElement", "shareReplay", "watchElementFocus", "el", "active", "distinctUntilChanged", "watchElementHover", "el", "timeout", "defer", "merge", "fromEvent", "map", "debounce", "active", "timer", "identity", "startWith", "appendChild", "el", "child", "node", "h", "tag", "attributes", "children", "attr", "round", "value", "digits", "watchScript", "src", "script", "h", "defer", "merge", "fromEvent", "switchMap", "throwError", "map", "finalize", "take", "entry$", "Subject", "observer$", "defer", "watchScript", "of", "map", "entries", "entry", "switchMap", "observer", "merge", "NEVER", "finalize", "shareReplay", "getElementSize", "el", "watchElementSize", "target", "tap", "filter", "startWith", "getElementContentSize", "el", "getElementContainer", "parent", "getElementContainers", "containers", "getElementOffset", "el", "getElementOffsetAbsolute", "rect", "watchElementOffset", "merge", "fromEvent", "auditTime", "animationFrameScheduler", "map", "startWith", "getElementContentOffset", "el", "watchElementContentOffset", "merge", "fromEvent", "auditTime", "animationFrameScheduler", "map", "startWith", "entry$", "Subject", "observer$", "defer", "of", "entries", "entry", "switchMap", "observer", "merge", "NEVER", "finalize", "shareReplay", "watchElementVisibility", "el", "tap", "filter", "target", "map", "isIntersecting", "watchElementBoundary", "threshold", "watchElementContentOffset", "y", "visible", "getElementSize", "content", "getElementContentSize", "distinctUntilChanged", "toggles", "getElement", "getToggle", "name", "setToggle", "value", "watchToggle", "el", "fromEvent", "map", "startWith", "isSusceptibleToKeyboard", "el", "type", "watchComposition", "merge", "fromEvent", "map", "startWith", "watchKeyboard", "keyboard$", "filter", "ev", "getToggle", "mode", "active", "getActiveElement", "share", "switchMap", "EMPTY", "getLocation", "setLocation", "url", "navigate", "feature", "el", "h", "watchLocation", "Subject", 
"getLocationHash", "setLocationHash", "hash", "el", "h", "ev", "watchLocationHash", "location$", "merge", "fromEvent", "map", "startWith", "filter", "shareReplay", "watchLocationTarget", "id", "getOptionalElement", "watchMedia", "query", "media", "fromEventPattern", "next", "startWith", "watchPrint", "merge", "fromEvent", "map", "at", "query$", "factory", "switchMap", "active", "EMPTY", "request", "url", "options", "Observable", "observer", "req", "event", "_a", "length", "requestJSON", "switchMap", "res", "map", "body", "shareReplay", "requestHTML", "dom", "requestXML", "getViewportOffset", "watchViewportOffset", "merge", "fromEvent", "map", "startWith", "getViewportSize", "watchViewportSize", "fromEvent", "map", "startWith", "watchViewport", "combineLatest", "watchViewportOffset", "watchViewportSize", "map", "offset", "size", "shareReplay", "watchViewportAt", "el", "viewport$", "header$", "size$", "distinctUntilKeyChanged", "offset$", "combineLatest", "map", "getElementOffset", "height", "offset", "size", "x", "y", "recv", "worker", "fromEvent", "ev", "send", "send$", "Subject", "data", "watchWorker", "url", "recv$", "worker$", "done$", "ignoreElements", "endWith", "mergeWith", "takeUntil", "share", "script", "getElement", "config", "getLocation", "configuration", "feature", "flag", "translation", "key", "value", "getComponentElement", "type", "node", "getElement", "getComponentElements", "getElements", "watchAnnounce", "el", "button", "getElement", "fromEvent", "map", "content", "mountAnnounce", "feature", "EMPTY", "defer", "push$", "Subject", "hash", "tap", "state", "finalize", "__spreadValues", "watchConsent", "el", "target$", "map", "target", "mountConsent", "options", "internal$", "Subject", "hidden", "tap", "state", "finalize", "__spreadValues", "renderTooltip", "id", "style", "h", "renderInlineTooltip2", "children", "renderAnnotation", "id", "prefix", "anchor", "h", "renderTooltip", "renderClipboardButton", "id", "h", "translation", "renderSearchDocument", 
"document", "flag", "parent", "teaser", "missing", "key", "list", "h", "config", "configuration", "url", "feature", "match", "highlight", "value", "tags", "tag", "type", "translation", "renderSearchResultItem", "result", "threshold", "docs", "doc", "article", "index", "best", "more", "children", "section", "renderSourceFacts", "facts", "h", "key", "value", "round", "renderTabbedControl", "type", "classes", "h", "renderTable", "table", "h", "renderVersion", "version", "_a", "config", "configuration", "url", "h", "renderVersionSelector", "versions", "active", "translation", "sequence", "watchTooltip2", "el", "active$", "combineLatest", "watchElementFocus", "watchElementHover", "map", "focus", "hover", "distinctUntilChanged", "offset$", "defer", "getElementContainers", "mergeMap", "watchElementContentOffset", "throttleTime", "getElementOffsetAbsolute", "first", "active", "switchMap", "offset", "share", "mountTooltip2", "dependencies", "content$", "viewport$", "id", "push$", "Subject", "show$", "BehaviorSubject", "ignoreElements", "endWith", "node$", "debounce", "timer", "queueScheduler", "EMPTY", "tap", "node", "startWith", "states", "origin$", "filter", "withLatestFrom", "_", "size", "host", "x", "height", "getElementSize", "origin", "getElement", "observeOn", "animationFrameScheduler", "state", "finalize", "__spreadValues", "mountInlineTooltip2", "container", "Observable", "observer", "title", "renderInlineTooltip2", "watchAnnotation", "el", "container", "offset$", "defer", "combineLatest", "watchElementOffset", "watchElementContentOffset", "map", "x", "y", "scroll", "width", "height", "getElementSize", "watchElementFocus", "switchMap", "active", "offset", "take", "mountAnnotation", "target$", "tooltip", "index", "push$", "Subject", "done$", "ignoreElements", "endWith", "watchElementVisibility", "takeUntil", "visible", "merge", "filter", "debounceTime", "auditTime", "animationFrameScheduler", "throttleTime", "origin", "fromEvent", "ev", "withLatestFrom", "_a", 
"parent", "getActiveElement", "target", "delay", "tap", "state", "finalize", "__spreadValues", "findHosts", "container", "getElements", "findMarkers", "markers", "el", "nodes", "it", "node", "text", "match", "id", "force", "marker", "swap", "source", "target", "mountAnnotationList", "target$", "print$", "parent", "prefix", "annotations", "getOptionalElement", "renderAnnotation", "EMPTY", "defer", "push$", "Subject", "done$", "ignoreElements", "endWith", "pairs", "annotation", "getElement", "takeUntil", "active", "inner", "child", "merge", "mountAnnotation", "finalize", "share", "findList", "el", "sibling", "mountAnnotationBlock", "options", "defer", "list", "mountAnnotationList", "EMPTY", "import_clipboard", "sequence", "findCandidateList", "el", "sibling", "watchCodeBlock", "watchElementSize", "map", "width", "getElementContentSize", "distinctUntilKeyChanged", "mountCodeBlock", "options", "hover", "factory$", "defer", "push$", "Subject", "done$", "takeLast", "scrollable", "content$", "ClipboardJS", "feature", "parent", "button", "renderClipboardButton", "mountInlineTooltip2", "container", "list", "annotations$", "mountAnnotationList", "takeUntil", "height", "distinctUntilChanged", "switchMap", "active", "EMPTY", "getElements", "tap", "state", "finalize", "__spreadValues", "mergeWith", "watchElementVisibility", "filter", "visible", "take", "watchDetails", "el", "target$", "print$", "open", "merge", "map", "target", "filter", "details", "active", "tap", "mountDetails", "options", "defer", "push$", "Subject", "action", "reveal", "state", "finalize", "__spreadValues", "mermaid_default", "mermaid$", "sequence", "fetchScripts", "watchScript", "of", "mountMermaid", "el", "tap", "mermaid_default", "map", "shareReplay", "__async", "id", "host", "h", "text", "svg", "fn", "shadow", "sentinel", "h", "mountDataTable", "el", "renderTable", "of", "watchContentTabs", "inputs", "initial", "input", "merge", "fromEvent", "map", "getElement", "startWith", "active", 
"mountContentTabs", "el", "viewport$", "target$", "container", "getElements", "prev", "renderTabbedControl", "next", "defer", "push$", "Subject", "done$", "ignoreElements", "endWith", "combineLatest", "watchElementSize", "watchElementVisibility", "takeUntil", "auditTime", "animationFrameScheduler", "size", "offset", "getElementOffset", "width", "getElementSize", "content", "getElementContentOffset", "watchElementContentOffset", "getElementContentSize", "direction", "filter", "label", "h", "ev", "tap", "feature", "skip", "withLatestFrom", "tab", "y", "set", "tabs", "media", "state", "finalize", "__spreadValues", "subscribeOn", "asyncScheduler", "mountContent", "el", "viewport$", "target$", "print$", "merge", "getElements", "child", "mountAnnotationBlock", "mountCodeBlock", "mountMermaid", "mountDataTable", "mountDetails", "mountContentTabs", "feature", "mountInlineTooltip2", "watchDialog", "_el", "alert$", "switchMap", "message", "merge", "of", "delay", "map", "active", "mountDialog", "el", "options", "inner", "getElement", "defer", "push$", "Subject", "tap", "state", "finalize", "__spreadValues", "sequence", "watchTooltip", "el", "host", "width", "getElementSize", "container", "getElementContainer", "scroll$", "watchElementContentOffset", "of", "active$", "merge", "watchElementFocus", "watchElementHover", "distinctUntilChanged", "combineLatest", "map", "active", "scroll", "x", "y", "getElementOffset", "size", "table", "mountTooltip", "title", "EMPTY", "id", "tooltip", "renderTooltip", "typeset", "getElement", "defer", "push$", "Subject", "offset", "filter", "debounceTime", "auditTime", "animationFrameScheduler", "throttleTime", "origin", "tap", "state", "finalize", "__spreadValues", "subscribeOn", "asyncScheduler", "isHidden", "viewport$", "feature", "of", "direction$", "map", "y", "bufferCount", "a", "b", "distinctUntilKeyChanged", "hidden$", "combineLatest", "filter", "offset", "direction", "distinctUntilChanged", "search$", "watchToggle", "search", "switchMap", 
"active", "startWith", "watchHeader", "el", "options", "defer", "watchElementSize", "height", "hidden", "shareReplay", "mountHeader", "header$", "main$", "push$", "Subject", "done$", "ignoreElements", "endWith", "combineLatestWith", "tooltips", "from", "getElements", "mergeMap", "child", "mountTooltip", "takeUntil", "state", "__spreadValues", "mergeWith", "watchHeaderTitle", "el", "viewport$", "header$", "watchViewportAt", "map", "y", "height", "getElementSize", "distinctUntilKeyChanged", "mountHeaderTitle", "options", "defer", "push$", "Subject", "active", "heading", "getOptionalElement", "EMPTY", "tap", "state", "finalize", "__spreadValues", "watchMain", "el", "viewport$", "header$", "adjust$", "map", "height", "distinctUntilChanged", "border$", "switchMap", "watchElementSize", "distinctUntilKeyChanged", "combineLatest", "header", "top", "bottom", "y", "a", "b", "watchPalette", "inputs", "current", "input", "index", "of", "mergeMap", "fromEvent", "map", "startWith", "shareReplay", "mountPalette", "el", "getElements", "meta", "h", "scheme", "media$", "watchMedia", "defer", "push$", "Subject", "palette", "media", "key", "value", "label", "filter", "ev", "withLatestFrom", "_", "header", "getComponentElement", "style", "color", "observeOn", "asyncScheduler", "takeUntil", "skip", "repeat", "tap", "state", "finalize", "__spreadValues", "mountProgress", "el", "progress$", "defer", "push$", "Subject", "value", "tap", "finalize", "map", "import_clipboard", "extract", "el", "copy", "text", "setupClipboardJS", "alert$", "ClipboardJS", "Observable", "subscriber", "getElement", "ev", "tap", "map", "translation", "resolve", "url", "base", "extract", "document", "sitemap", "el", "getElements", "getElement", "links", "link", "href", "fetchSitemap", "requestXML", "map", "catchError", "of", "handle", "ev", "sitemap", "EMPTY", "el", "url", "of", "head", "document", "tags", "getElements", "resolve", "key", "value", "inject", "next", "selector", "feature", "source", 
"getOptionalElement", "target", "html", "name", "container", "getComponentElement", "concat", "switchMap", "script", "Observable", "observer", "ignoreElements", "endWith", "setupInstantNavigation", "location$", "viewport$", "progress$", "config", "configuration", "sitemap$", "fetchSitemap", "instant$", "fromEvent", "combineLatestWith", "share", "history$", "map", "getLocation", "withLatestFrom", "offset", "merge", "document$", "distinctUntilKeyChanged", "requestHTML", "catchError", "setLocation", "_", "distinctUntilChanged", "a", "b", "tap", "_a", "_b", "setLocationHash", "debounceTime", "import_escape_html", "setupSearchHighlighter", "config", "regex", "term", "separator", "highlight", "_", "data", "query", "match", "value", "escapeHTML", "isSearchReadyMessage", "message", "isSearchResultMessage", "setupSearchWorker", "url", "index$", "worker$", "watchWorker", "merge", "of", "watchToggle", "first", "active", "switchMap", "config", "docs", "feature", "setupVersionSelector", "document$", "config", "configuration", "versions$", "requestJSON", "catchError", "EMPTY", "current$", "map", "versions", "current", "version", "aliases", "switchMap", "urls", "fromEvent", "filter", "ev", "withLatestFrom", "el", "url", "of", "fetchSitemap", "sitemap", "path", "getLocation", "setLocation", "combineLatest", "getElement", "renderVersionSelector", "_a", "outdated", "ignored", "main", "ignore", "warning", "getComponentElements", "watchSearchQuery", "el", "worker$", "searchParams", "getLocation", "setToggle", "watchToggle", "first", "active", "url", "focus$", "watchElementFocus", "value$", "merge", "isSearchReadyMessage", "fromEvent", "map", "distinctUntilChanged", "combineLatest", "value", "focus", "shareReplay", "mountSearchQuery", "push$", "Subject", "done$", "ignoreElements", "endWith", "_", "query", "distinctUntilKeyChanged", "takeUntil", "label", "getElement", "tap", "state", "finalize", "__spreadValues", "mountSearchResult", "el", "worker$", "query$", "push$", "Subject", 
"boundary$", "watchElementBoundary", "filter", "container", "meta", "getElement", "list", "watchToggle", "active", "withLatestFrom", "skipUntil", "first", "isSearchReadyMessage", "items", "value", "translation", "count", "round", "render$", "tap", "switchMap", "merge", "of", "bufferCount", "zipWith", "chunk", "map", "renderSearchResultItem", "share", "item", "mergeMap", "details", "getOptionalElement", "EMPTY", "fromEvent", "takeUntil", "isSearchResultMessage", "data", "state", "finalize", "__spreadValues", "watchSearchShare", "_el", "query$", "map", "value", "url", "getLocation", "mountSearchShare", "el", "options", "push$", "Subject", "done$", "ignoreElements", "endWith", "fromEvent", "takeUntil", "ev", "tap", "state", "finalize", "__spreadValues", "mountSearchSuggest", "el", "worker$", "keyboard$", "push$", "Subject", "query", "getComponentElement", "query$", "merge", "fromEvent", "observeOn", "asyncScheduler", "map", "distinctUntilChanged", "combineLatestWith", "suggest", "value", "words", "last", "filter", "mode", "key", "isSearchResultMessage", "data", "tap", "state", "finalize", "mountSearch", "el", "index$", "keyboard$", "config", "configuration", "worker$", "setupSearchWorker", "query", "getComponentElement", "result", "fromEvent", "filter", "target", "setToggle", "mode", "key", "active", "getActiveElement", "anchors", "anchor", "getElements", "article", "best", "a", "b", "els", "i", "query$", "mountSearchQuery", "merge", "mountSearchResult", "mergeWith", "getComponentElements", "child", "mountSearchShare", "mountSearchSuggest", "err", "NEVER", "mountSearchHiglight", "el", "index$", "location$", "combineLatest", "startWith", "getLocation", "filter", "url", "map", "index", "setupSearchHighlighter", "fn", "_a", "nodes", "it", "node", "original", "replaced", "text", "childNodes", "h", "watchSidebar", "el", "viewport$", "main$", "parent", "adjust", "combineLatest", "map", "offset", "height", "y", "distinctUntilChanged", "a", "b", "mountSidebar", "_a", "_b", 
"header$", "options", "__objRest", "inner", "getElement", "getElementOffset", "defer", "push$", "Subject", "done$", "ignoreElements", "endWith", "next$", "auditTime", "animationFrameScheduler", "withLatestFrom", "first", "item", "getElements", "container", "getElementSize", "from", "mergeMap", "label", "fromEvent", "observeOn", "asyncScheduler", "takeUntil", "input", "tap", "state", "finalize", "__spreadValues", "fetchSourceFactsFromGitHub", "user", "repo", "url", "zip", "requestJSON", "catchError", "EMPTY", "map", "release", "defaultIfEmpty", "info", "__spreadValues", "fetchSourceFactsFromGitLab", "base", "project", "url", "requestJSON", "catchError", "EMPTY", "map", "star_count", "forks_count", "defaultIfEmpty", "fetchSourceFacts", "url", "match", "user", "repo", "fetchSourceFactsFromGitHub", "base", "slug", "fetchSourceFactsFromGitLab", "EMPTY", "fetch$", "watchSource", "el", "defer", "cached", "of", "getComponentElements", "consent", "EMPTY", "fetchSourceFacts", "tap", "facts", "catchError", "filter", "map", "shareReplay", "mountSource", "inner", "getElement", "push$", "Subject", "renderSourceFacts", "state", "finalize", "__spreadValues", "watchTabs", "el", "viewport$", "header$", "watchElementSize", "switchMap", "watchViewportAt", "map", "y", "distinctUntilKeyChanged", "mountTabs", "options", "defer", "push$", "Subject", "hidden", "feature", "of", "tap", "state", "finalize", "__spreadValues", "watchTableOfContents", "el", "viewport$", "header$", "table", "anchors", "getElements", "anchor", "id", "target", "getOptionalElement", "adjust$", "distinctUntilKeyChanged", "map", "height", "main", "getComponentElement", "grid", "getElement", "share", "watchElementSize", "switchMap", "body", "defer", "path", "of", "index", "offset", "parent", "a", "b", "combineLatestWith", "adjust", "scan", "prev", "next", "y", "size", "last", "distinctUntilChanged", "startWith", "bufferCount", "mountTableOfContents", "main$", "target$", "push$", "Subject", "done$", "ignoreElements", 
"endWith", "feature", "smooth$", "merge", "debounceTime", "filter", "observeOn", "asyncScheduler", "withLatestFrom", "behavior", "container", "getElementContainer", "getElementSize", "takeUntil", "skip", "repeat", "url", "getLocation", "active", "hash", "tap", "state", "finalize", "__spreadValues", "watchBackToTop", "_el", "viewport$", "main$", "target$", "direction$", "map", "y", "bufferCount", "b", "distinctUntilChanged", "active$", "active", "combineLatest", "direction", "takeUntil", "skip", "endWith", "repeat", "hidden", "mountBackToTop", "el", "header$", "push$", "Subject", "done$", "ignoreElements", "distinctUntilKeyChanged", "height", "fromEvent", "ev", "tap", "state", "finalize", "__spreadValues", "patchEllipsis", "document$", "viewport$", "switchMap", "getElements", "mergeMap", "el", "watchElementVisibility", "takeUntil", "skip", "filter", "visible", "map", "take", "text", "host", "feature", "mountInlineTooltip2", "finalize", "EMPTY", "patchIndeterminate", "document$", "tablet$", "switchMap", "getElements", "tap", "el", "mergeMap", "fromEvent", "takeWhile", "map", "withLatestFrom", "tablet", "isAppleDevice", "patchScrollfix", "document$", "switchMap", "getElements", "tap", "el", "filter", "mergeMap", "fromEvent", "map", "top", "patchScrolllock", "viewport$", "tablet$", "combineLatest", "watchToggle", "map", "active", "tablet", "switchMap", "of", "delay", "withLatestFrom", "y", "value", "obj", "data", "key", "x", "y", "nodes", "parent", "i", "node", "fetchSearchIndex", "watchScript", "config", "map", "shareReplay", "requestJSON", "document$", "watchDocument", "location$", "watchLocation", "target$", "watchLocationTarget", "keyboard$", "watchKeyboard", "viewport$", "watchViewport", "tablet$", "watchMedia", "screen$", "print$", "watchPrint", "configuration", "index$", "NEVER", "alert$", "Subject", "setupClipboardJS", "progress$", "feature", "setupInstantNavigation", "_a", "setupVersionSelector", "merge", "delay", "setToggle", "filter", "mode", "key", "prev", 
"getOptionalElement", "setLocation", "next", "active", "getActiveElement", "patchEllipsis", "patchIndeterminate", "patchScrollfix", "patchScrolllock", "header$", "watchHeader", "getComponentElement", "main$", "switchMap", "el", "watchMain", "control$", "getComponentElements", "mountConsent", "mountDialog", "mountHeader", "mountPalette", "mountProgress", "mountSearch", "mountSource", "content$", "defer", "mountAnnounce", "mountContent", "mountSearchHiglight", "EMPTY", "mountHeaderTitle", "at", "mountSidebar", "mountTabs", "mountTableOfContents", "mountBackToTop", "component$", "mergeWith"]
+}
diff --git a/assets/javascripts/lunr/min/lunr.ar.min.js b/assets/javascripts/lunr/min/lunr.ar.min.js
new file mode 100644
index 0000000..9b06c26
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.ar.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ar=function(){this.pipeline.reset(),this.pipeline.add(e.ar.trimmer,e.ar.stopWordFilter,e.ar.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ar.stemmer))},e.ar.wordCharacters="ء-ٛٱـ",e.ar.trimmer=e.trimmerSupport.generateTrimmer(e.ar.wordCharacters),e.Pipeline.registerFunction(e.ar.trimmer,"trimmer-ar"),e.ar.stemmer=function(){var e=this;return e.result=!1,e.preRemoved=!1,e.sufRemoved=!1,e.pre={pre1:"ف ك ب و س ل ن ا ي ت",pre2:"ال لل",pre3:"بال وال فال تال كال ولل",pre4:"فبال كبال وبال وكال"},e.suf={suf1:"ه ك ت ن ا ي",suf2:"نك نه ها وك يا اه ون ين تن تم نا وا ان كم كن ني نن ما هم هن تك ته ات يه",suf3:"تين كهم نيه نهم ونه وها يهم ونا ونك وني وهم تكم تنا تها تني تهم كما كها ناه نكم هنا تان يها",suf4:"كموه ناها ونني ونهم تكما تموه تكاه كماه ناكم ناهم نيها 
وننا"},e.patterns=JSON.parse('{"pt43":[{"pt":[{"c":"ا","l":1}]},{"pt":[{"c":"ا,ت,ن,ي","l":0}],"mPt":[{"c":"ف","l":0,"m":1},{"c":"ع","l":1,"m":2},{"c":"ل","l":2,"m":3}]},{"pt":[{"c":"و","l":2}],"mPt":[{"c":"ف","l":0,"m":0},{"c":"ع","l":1,"m":1},{"c":"ل","l":2,"m":3}]},{"pt":[{"c":"ا","l":2}]},{"pt":[{"c":"ي","l":2}],"mPt":[{"c":"ف","l":0,"m":0},{"c":"ع","l":1,"m":1},{"c":"ا","l":2},{"c":"ل","l":3,"m":3}]},{"pt":[{"c":"م","l":0}]}],"pt53":[{"pt":[{"c":"ت","l":0},{"c":"ا","l":2}]},{"pt":[{"c":"ا,ن,ت,ي","l":0},{"c":"ت","l":2}],"mPt":[{"c":"ا","l":0},{"c":"ف","l":1,"m":1},{"c":"ت","l":2},{"c":"ع","l":3,"m":3},{"c":"ا","l":4},{"c":"ل","l":5,"m":4}]},{"pt":[{"c":"ا","l":0},{"c":"ا","l":2}],"mPt":[{"c":"ا","l":0},{"c":"ف","l":1,"m":1},{"c":"ع","l":2,"m":3},{"c":"ل","l":3,"m":4},{"c":"ا","l":4},{"c":"ل","l":5,"m":4}]},{"pt":[{"c":"ا","l":0},{"c":"ا","l":3}],"mPt":[{"c":"ف","l":0,"m":1},{"c":"ع","l":1,"m":2},{"c":"ل","l":2,"m":4}]},{"pt":[{"c":"ا","l":3},{"c":"ن","l":4}]},{"pt":[{"c":"ت","l":0},{"c":"ي","l":3}]},{"pt":[{"c":"م","l":0},{"c":"و","l":3}]},{"pt":[{"c":"ا","l":1},{"c":"و","l":3}]},{"pt":[{"c":"و","l":1},{"c":"ا","l":2}]},{"pt":[{"c":"م","l":0},{"c":"ا","l":3}]},{"pt":[{"c":"م","l":0},{"c":"ي","l":3}]},{"pt":[{"c":"ا","l":2},{"c":"ن","l":3}]},{"pt":[{"c":"م","l":0},{"c":"ن","l":1}],"mPt":[{"c":"ا","l":0},{"c":"ن","l":1},{"c":"ف","l":2,"m":2},{"c":"ع","l":3,"m":3},{"c":"ا","l":4},{"c":"ل","l":5,"m":4}]},{"pt":[{"c":"م","l":0},{"c":"ت","l":2}],"mPt":[{"c":"ا","l":0},{"c":"ف","l":1,"m":1},{"c":"ت","l":2},{"c":"ع","l":3,"m":3},{"c":"ا","l":4},{"c":"ل","l":5,"m":4}]},{"pt":[{"c":"م","l":0},{"c":"ا","l":2}]},{"pt":[{"c":"م","l":1},{"c":"ا","l":3}]},{"pt":[{"c":"ي,ت,ا,ن","l":0},{"c":"ت","l":1}],"mPt":[{"c":"ف","l":0,"m":2},{"c":"ع","l":1,"m":3},{"c":"ا","l":2},{"c":"ل","l":3,"m":4}]},{"pt":[{"c":"ت,ي,ا,ن","l":0},{"c":"ت","l":2}],"mPt":[{"c":"ا","l":0},{"c":"ف","l":1,"m":1},{"c":"ت","l":2},{"c":"ع","l":3,"m":3},{"c":"ا","l":4},{"c":"ل","l":5,"m":4}]},{"pt":[{"c":"ا","l":2}
,{"c":"ي","l":3}]},{"pt":[{"c":"ا,ي,ت,ن","l":0},{"c":"ن","l":1}],"mPt":[{"c":"ا","l":0},{"c":"ن","l":1},{"c":"ف","l":2,"m":2},{"c":"ع","l":3,"m":3},{"c":"ا","l":4},{"c":"ل","l":5,"m":4}]},{"pt":[{"c":"ا","l":3},{"c":"ء","l":4}]}],"pt63":[{"pt":[{"c":"ا","l":0},{"c":"ت","l":2},{"c":"ا","l":4}]},{"pt":[{"c":"ا,ت,ن,ي","l":0},{"c":"س","l":1},{"c":"ت","l":2}],"mPt":[{"c":"ا","l":0},{"c":"س","l":1},{"c":"ت","l":2},{"c":"ف","l":3,"m":3},{"c":"ع","l":4,"m":4},{"c":"ا","l":5},{"c":"ل","l":6,"m":5}]},{"pt":[{"c":"ا,ن,ت,ي","l":0},{"c":"و","l":3}]},{"pt":[{"c":"م","l":0},{"c":"س","l":1},{"c":"ت","l":2}],"mPt":[{"c":"ا","l":0},{"c":"س","l":1},{"c":"ت","l":2},{"c":"ف","l":3,"m":3},{"c":"ع","l":4,"m":4},{"c":"ا","l":5},{"c":"ل","l":6,"m":5}]},{"pt":[{"c":"ي","l":1},{"c":"ي","l":3},{"c":"ا","l":4},{"c":"ء","l":5}]},{"pt":[{"c":"ا","l":0},{"c":"ن","l":1},{"c":"ا","l":4}]}],"pt54":[{"pt":[{"c":"ت","l":0}]},{"pt":[{"c":"ا,ي,ت,ن","l":0}],"mPt":[{"c":"ا","l":0},{"c":"ف","l":1,"m":1},{"c":"ع","l":2,"m":2},{"c":"ل","l":3,"m":3},{"c":"ر","l":4,"m":4},{"c":"ا","l":5},{"c":"ر","l":6,"m":4}]},{"pt":[{"c":"م","l":0}],"mPt":[{"c":"ا","l":0},{"c":"ف","l":1,"m":1},{"c":"ع","l":2,"m":2},{"c":"ل","l":3,"m":3},{"c":"ر","l":4,"m":4},{"c":"ا","l":5},{"c":"ر","l":6,"m":4}]},{"pt":[{"c":"ا","l":2}]},{"pt":[{"c":"ا","l":0},{"c":"ن","l":2}]}],"pt64":[{"pt":[{"c":"ا","l":0},{"c":"ا","l":4}]},{"pt":[{"c":"م","l":0},{"c":"ت","l":1}]}],"pt73":[{"pt":[{"c":"ا","l":0},{"c":"س","l":1},{"c":"ت","l":2},{"c":"ا","l":5}]}],"pt75":[{"pt":[{"c":"ا","l":0},{"c":"ا","l":5}]}]}'),e.execArray=["cleanWord","removeDiacritics","cleanAlef","removeStopWords","normalizeHamzaAndAlef","removeStartWaw","removePre432","removeEndTaa","wordCheck"],e.stem=function(){var r=0;for(e.result=!1,e.preRemoved=!1,e.sufRemoved=!1;r<e.execArray.length&&1!=e.result;)e.result=e[e.execArray[r]](),r++},e.setCurrent=function(r){e.word=r},e.getCurrent=function(){return e.word},e.cleanWord=function(){var r=new RegExp("[^ء-ٛٱـ]");return 
e.word=e.word.replace(new RegExp("ـ","g"),""),!!r.test("")},e.removeDiacritics=function(){new RegExp("[ً-ٛ]");return e.word=e.word.replace(/[\u064b-\u065b]/gi,""),!1},e.cleanAlef=function(){var r=new RegExp("[آأإٱى]");return e.word=e.word.replace(r,"ا"),!1},e.removeStopWords=function(){if("، اض امين اه اها اي ا اب اجل اجمع اخ اخذ اصبح اضحى اقبل اقل اكثر الا ام اما امامك امامك امسى اما ان انا انت انتم انتما انتن انت انشا انى او اوشك اولئك اولئكم اولاء اولالك اوه اي ايا اين اينما اي ان اي اف اذ اذا اذا اذما اذن الى اليكم اليكما اليكن اليك اليك الا اما ان انما اي اياك اياكم اياكما اياكن ايانا اياه اياها اياهم اياهما اياهن اياي ايه ان ا ابتدا اثر اجل احد اخرى اخلولق اذا اربعة ارتد استحال اطار اعادة اعلنت اف اكثر اكد الالاء الالى الا الاخيرة الان الاول الاولى التى التي الثاني الثانية الذاتي الذى الذي الذين السابق الف اللائي اللاتي اللتان اللتيا اللتين اللذان اللذين اللواتي الماضي المقبل الوقت الى اليوم اما امام امس ان انبرى انقلب انه انها او اول اي ايار ايام ايضا ب بات باسم بان بخ برس بسبب بس بشكل بضع بطان بعد بعض بك بكم بكما بكن بل بلى بما بماذا بمن بن بنا به بها بي بيد بين بس بله بئس تان تانك تبدل تجاه تحول تلقاء تلك تلكم تلكما تم تينك تين ته تي ثلاثة ثم ثم ثمة ثم جعل جلل جميع جير حار حاشا حاليا حاي حتى حرى حسب حم حوالى حول حيث حيثما حين حي حبذا حتى حذار خلا خلال دون دونك ذا ذات ذاك ذانك ذان ذلك ذلكم ذلكما ذلكن ذو ذوا ذواتا ذواتي ذيت ذينك ذين ذه ذي راح رجع رويدك ريث رب زيارة سبحان سرعان سنة سنوات سوف سوى ساء ساءما شبه شخصا شرع شتان صار صباح صفر صه صه ضد ضمن طاق طالما طفق طق ظل عاد عام عاما عامة عدا عدة عدد عدم عسى عشر عشرة علق على عليك عليه عليها عل عن عند عندما عوض عين عدس عما غدا غير  ف فان فلان فو فى في فيم فيما فيه فيها قال قام قبل قد قط قلما قوة كانما كاين كاي كاين كاد كان كانت كذا كذلك كرب كل كلا كلاهما كلتا كلم كليكما كليهما كلما كلا كم كما كي كيت كيف كيفما كان كخ لئن لا لات لاسيما لدن لدى لعمر لقاء لك لكم لكما لكن لكنما لكي لكيلا للامم لم لما لما لن لنا له لها لو لوكالة لولا لوما لي لست لست لستم لستما لستن لست لسن لعل لكن ليت ليس ليسا ليستا ليست ليسوا لسنا ما 
ماانفك مابرح مادام ماذا مازال مافتئ مايو متى مثل مذ مساء مع معاذ مقابل مكانكم مكانكما مكانكن مكانك مليار مليون مما ممن من منذ منها مه مهما من من نحن نحو نعم نفس نفسه نهاية نخ نعما نعم ها هاؤم هاك هاهنا هب هذا هذه هكذا هل هلم هلا هم هما هن هنا هناك هنالك هو هي هيا هيت هيا هؤلاء هاتان هاتين هاته هاتي هج هذا هذان هذين هذه هذي هيهات و وا واحد واضاف واضافت واكد وان واها واوضح وراءك وفي وقال وقالت وقد وقف وكان وكانت ولا ولم ومن وهو وهي ويكان وي وشكان يكون يمكن يوم ايان".split(" ").indexOf(e.word)>=0)return!0},e.normalizeHamzaAndAlef=function(){return e.word=e.word.replace("ؤ","ء"),e.word=e.word.replace("ئ","ء"),e.word=e.word.replace(/([\u0627])\1+/gi,"ا"),!1},e.removeEndTaa=function(){return!(e.word.length>2)||(e.word=e.word.replace(/[\u0627]$/,""),e.word=e.word.replace("ة",""),!1)},e.removeStartWaw=function(){return e.word.length>3&&"و"==e.word[0]&&"و"==e.word[1]&&(e.word=e.word.slice(1)),!1},e.removePre432=function(){var r=e.word;if(e.word.length>=7){var t=new RegExp("^("+e.pre.pre4.split(" ").join("|")+")");e.word=e.word.replace(t,"")}if(e.word==r&&e.word.length>=6){var c=new RegExp("^("+e.pre.pre3.split(" ").join("|")+")");e.word=e.word.replace(c,"")}if(e.word==r&&e.word.length>=5){var l=new RegExp("^("+e.pre.pre2.split(" ").join("|")+")");e.word=e.word.replace(l,"")}return r!=e.word&&(e.preRemoved=!0),!1},e.patternCheck=function(r){for(var t=0;t<r.length;t++){for(var c=!0,l=0;l<r[t].pt.length;l++){var n=r[t].pt[l].c.split(","),o=!1;if(n.forEach(function(c){e.word[r[t].pt[l].l]==c&&(o=!0)}),!o){c=!1;break}}if(1==c){if(r[t].mPt){for(var p=[],m=0;m<r[t].mPt.length;m++)null!=r[t].mPt[m].m?p[r[t].mPt[m].l]=e.word[r[t].mPt[m].m]:p[r[t].mPt[m].l]=r[t].mPt[m].c;e.word=p.join("")}e.result=!0;break}}},e.removePre1=function(){var r=e.word;if(0==e.preRemoved&&e.word.length>3){var t=new RegExp("^("+e.pre.pre1.split(" ").join("|")+")");e.word=e.word.replace(t,"")}return r!=e.word&&(e.preRemoved=!0),!1},e.removeSuf1=function(){var r=e.word;if(0==e.sufRemoved&&e.word.length>3){var 
t=new RegExp("("+e.suf.suf1.split(" ").join("|")+")$");e.word=e.word.replace(t,"")}return r!=e.word&&(e.sufRemoved=!0),!1},e.removeSuf432=function(){var r=e.word;if(e.word.length>=6){var t=new RegExp("("+e.suf.suf4.split(" ").join("|")+")$");e.word=e.word.replace(t,"")}if(e.word==r&&e.word.length>=5){var c=new RegExp("("+e.suf.suf3.split(" ").join("|")+")$");e.word=e.word.replace(c,"")}if(e.word==r&&e.word.length>=4){var l=new RegExp("("+e.suf.suf2.split(" ").join("|")+")$");e.word=e.word.replace(l,"")}return r!=e.word&&(e.sufRemoved=!0),!1},e.wordCheck=function(){for(var r=(e.word,[e.removeSuf432,e.removeSuf1,e.removePre1]),t=0,c=!1;e.word.length>=7&&!e.result&&t<r.length;)7!=e.word.length||c?(r[t](),t++,c=!1):(e.checkPattern73(),c=!0);var l=[e.checkPattern63,e.removeSuf432,e.removeSuf1,e.removePre1,e.checkPattern64];for(t=0;6==e.word.length&&!e.result&&t<l.length;)l[t](),t++;var n=[e.checkPattern53,e.removeSuf432,e.removeSuf1,e.removePre1,e.checkPattern54];for(t=0;5==e.word.length&&!e.result&&t<n.length;)n[t](),t++;var o=[e.checkPattern43,e.removeSuf1,e.removePre1,e.removeSuf432];for(t=0;4==e.word.length&&!e.result&&t<o.length;)o[t](),t++;return!0},e.checkPattern43=function(){e.patternCheck(e.patterns.pt43)},e.checkPattern53=function(){e.patternCheck(e.patterns.pt53)},e.checkPattern54=function(){e.patternCheck(e.patterns.pt54)},e.checkPattern63=function(){e.patternCheck(e.patterns.pt63)},e.checkPattern64=function(){e.patternCheck(e.patterns.pt64)},e.checkPattern73=function(){e.patternCheck(e.patterns.pt73)},function(r){return"function"==typeof r.update?r.update(function(r){return e.setCurrent(r),e.stem(),e.getCurrent()}):(e.setCurrent(r),e.stem(),e.getCurrent())}}(),e.Pipeline.registerFunction(e.ar.stemmer,"stemmer-ar"),e.ar.stopWordFilter=e.generateStopWordFilter("، اض امين اه اها اي ا اب اجل اجمع اخ اخذ اصبح اضحى اقبل اقل اكثر الا ام اما امامك امامك امسى اما ان انا انت انتم انتما انتن انت انشا انى او اوشك اولئك اولئكم اولاء اولالك اوه اي ايا اين اينما اي ان اي 
اف اذ اذا اذا اذما اذن الى اليكم اليكما اليكن اليك اليك الا اما ان انما اي اياك اياكم اياكما اياكن ايانا اياه اياها اياهم اياهما اياهن اياي ايه ان ا ابتدا اثر اجل احد اخرى اخلولق اذا اربعة ارتد استحال اطار اعادة اعلنت اف اكثر اكد الالاء الالى الا الاخيرة الان الاول الاولى التى التي الثاني الثانية الذاتي الذى الذي الذين السابق الف اللائي اللاتي اللتان اللتيا اللتين اللذان اللذين اللواتي الماضي المقبل الوقت الى اليوم اما امام امس ان انبرى انقلب انه انها او اول اي ايار ايام ايضا ب بات باسم بان بخ برس بسبب بس بشكل بضع بطان بعد بعض بك بكم بكما بكن بل بلى بما بماذا بمن بن بنا به بها بي بيد بين بس بله بئس تان تانك تبدل تجاه تحول تلقاء تلك تلكم تلكما تم تينك تين ته تي ثلاثة ثم ثم ثمة ثم جعل جلل جميع جير حار حاشا حاليا حاي حتى حرى حسب حم حوالى حول حيث حيثما حين حي حبذا حتى حذار خلا خلال دون دونك ذا ذات ذاك ذانك ذان ذلك ذلكم ذلكما ذلكن ذو ذوا ذواتا ذواتي ذيت ذينك ذين ذه ذي راح رجع رويدك ريث رب زيارة سبحان سرعان سنة سنوات سوف سوى ساء ساءما شبه شخصا شرع شتان صار صباح صفر صه صه ضد ضمن طاق طالما طفق طق ظل عاد عام عاما عامة عدا عدة عدد عدم عسى عشر عشرة علق على عليك عليه عليها عل عن عند عندما عوض عين عدس عما غدا غير  ف فان فلان فو فى في فيم فيما فيه فيها قال قام قبل قد قط قلما قوة كانما كاين كاي كاين كاد كان كانت كذا كذلك كرب كل كلا كلاهما كلتا كلم كليكما كليهما كلما كلا كم كما كي كيت كيف كيفما كان كخ لئن لا لات لاسيما لدن لدى لعمر لقاء لك لكم لكما لكن لكنما لكي لكيلا للامم لم لما لما لن لنا له لها لو لوكالة لولا لوما لي لست لست لستم لستما لستن لست لسن لعل لكن ليت ليس ليسا ليستا ليست ليسوا لسنا ما ماانفك مابرح مادام ماذا مازال مافتئ مايو متى مثل مذ مساء مع معاذ مقابل مكانكم مكانكما مكانكن مكانك مليار مليون مما ممن من منذ منها مه مهما من من نحن نحو نعم نفس نفسه نهاية نخ نعما نعم ها هاؤم هاك هاهنا هب هذا هذه هكذا هل هلم هلا هم هما هن هنا هناك هنالك هو هي هيا هيت هيا هؤلاء هاتان هاتين هاته هاتي هج هذا هذان هذين هذه هذي هيهات وا واحد واضاف واضافت واكد وان واها واوضح وراءك وفي وقال وقالت وقد وقف وكان وكانت ولا ولم ومن وهو وهي ويكان وي وشكان يكون يمكن يوم ايان".split(" 
")),e.Pipeline.registerFunction(e.ar.stopWordFilter,"stopWordFilter-ar")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.da.min.js b/assets/javascripts/lunr/min/lunr.da.min.js
new file mode 100644
index 0000000..b9d8509
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.da.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Danish` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.da=function(){this.pipeline.reset(),this.pipeline.add(e.da.trimmer,e.da.stopWordFilter,e.da.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.da.stemmer))},e.da.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.da.trimmer=e.trimmerSupport.generateTrimmer(e.da.wordCharacters),e.Pipeline.registerFunction(e.da.trimmer,"trimmer-da"),e.da.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){var e,r=f.cursor+3;if(d=f.limit,0<=r&&r<=f.limit){for(a=r;;){if(e=f.cursor,f.in_grouping(w,97,248)){f.cursor=e;break}if(f.cursor=e,e>=f.limit)return;f.cursor++}for(;!f.out_grouping(w,97,248);){if(f.cursor>=f.limit)return;f.cursor++}d=f.cursor,d<a&&(d=a)}}function n(){var e,r;if(f.cursor>=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(c,32),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del();break;case 2:f.in_grouping_b(p,97,229)&&f.slice_del()}}function t(){var e,r=f.limit-f.cursor;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.find_among_b(l,4)?(f.bra=f.cursor,f.limit_backward=e,f.cursor=f.limit-r,f.cursor>f.limit_backward&&(f.cursor--,f.bra=f.cursor,f.slice_del())):f.limit_backward=e)}function s(){var e,r,i,n=f.limit-f.cursor;if(f.ket=f.cursor,f.eq_s_b(2,"st")&&(f.bra=f.cursor,f.eq_s_b(2,"ig")&&f.slice_del()),f.cursor=f.limit-n,f.cursor>=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(m,5),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 
1:f.slice_del(),i=f.limit-f.cursor,t(),f.cursor=f.limit-i;break;case 2:f.slice_from("løs")}}function o(){var e;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.out_grouping_b(w,97,248)?(f.bra=f.cursor,u=f.slice_to(u),f.limit_backward=e,f.eq_v_b(u)&&f.slice_del()):f.limit_backward=e)}var a,d,u,c=[new r("hed",-1,1),new r("ethed",0,1),new r("ered",-1,1),new r("e",-1,1),new r("erede",3,1),new r("ende",3,1),new r("erende",5,1),new r("ene",3,1),new r("erne",3,1),new r("ere",3,1),new r("en",-1,1),new r("heden",10,1),new r("eren",10,1),new r("er",-1,1),new r("heder",13,1),new r("erer",13,1),new r("s",-1,2),new r("heds",16,1),new r("es",16,1),new r("endes",18,1),new r("erendes",19,1),new r("enes",18,1),new r("ernes",18,1),new r("eres",18,1),new r("ens",16,1),new r("hedens",24,1),new r("erens",24,1),new r("ers",16,1),new r("ets",16,1),new r("erets",28,1),new r("et",-1,1),new r("eret",30,1)],l=[new r("gd",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("elig",1,1),new r("els",-1,1),new r("løst",-1,2)],w=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],p=[239,254,42,3,0,0,0,0,0,0,0,0,0,0,0,0,16],f=new i;this.setCurrent=function(e){f.setCurrent(e)},this.getCurrent=function(){return f.getCurrent()},this.stem=function(){var r=f.cursor;return e(),f.limit_backward=r,f.cursor=f.limit,n(),f.cursor=f.limit,t(),f.cursor=f.limit,s(),f.cursor=f.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.da.stemmer,"stemmer-da"),e.da.stopWordFilter=e.generateStopWordFilter("ad af alle alt anden at blev blive bliver da de dem den denne der deres det dette dig din disse dog du efter eller en end er et for fra ham han hans har havde have hende hendes her hos hun hvad hvis hvor i ikke ind jeg jer jo kunne man mange med meget men mig min mine mit mod ned noget nogle nu 
når og også om op os over på selv sig sin sine sit skal skulle som sådan thi til ud under var vi vil ville vor være været".split(" ")),e.Pipeline.registerFunction(e.da.stopWordFilter,"stopWordFilter-da")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.de.min.js b/assets/javascripts/lunr/min/lunr.de.min.js
new file mode 100644
index 0000000..f3b5c10
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.de.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `German` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.de=function(){this.pipeline.reset(),this.pipeline.add(e.de.trimmer,e.de.stopWordFilter,e.de.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.de.stemmer))},e.de.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.de.trimmer=e.trimmerSupport.generateTrimmer(e.de.wordCharacters),e.Pipeline.registerFunction(e.de.trimmer,"trimmer-de"),e.de.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(e,r,n){return!(!v.eq_s(1,e)||(v.ket=v.cursor,!v.in_grouping(p,97,252)))&&(v.slice_from(r),v.cursor=n,!0)}function i(){for(var r,n,i,s,t=v.cursor;;)if(r=v.cursor,v.bra=r,v.eq_s(1,"ß"))v.ket=v.cursor,v.slice_from("ss");else{if(r>=v.limit)break;v.cursor=r+1}for(v.cursor=t;;)for(n=v.cursor;;){if(i=v.cursor,v.in_grouping(p,97,252)){if(s=v.cursor,v.bra=s,e("u","U",i))break;if(v.cursor=s,e("y","Y",i))break}if(i>=v.limit)return void(v.cursor=n);v.cursor=i+1}}function s(){for(;!v.in_grouping(p,97,252);){if(v.cursor>=v.limit)return!0;v.cursor++}for(;!v.out_grouping(p,97,252);){if(v.cursor>=v.limit)return!0;v.cursor++}return!1}function t(){m=v.limit,l=m;var e=v.cursor+3;0<=e&&e<=v.limit&&(d=e,s()||(m=v.cursor,m<d&&(m=d),s()||(l=v.cursor)))}function o(){for(var e,r;;){if(r=v.cursor,v.bra=r,!(e=v.find_among(h,6)))return;switch(v.ket=v.cursor,e){case 1:v.slice_from("y");break;case 2:case 5:v.slice_from("u");break;case 3:v.slice_from("a");break;case 4:v.slice_from("o");break;case 6:if(v.cursor>=v.limit)return;v.cursor++}}}function c(){return 
m<=v.cursor}function u(){return l<=v.cursor}function a(){var e,r,n,i,s=v.limit-v.cursor;if(v.ket=v.cursor,(e=v.find_among_b(w,7))&&(v.bra=v.cursor,c()))switch(e){case 1:v.slice_del();break;case 2:v.slice_del(),v.ket=v.cursor,v.eq_s_b(1,"s")&&(v.bra=v.cursor,v.eq_s_b(3,"nis")&&v.slice_del());break;case 3:v.in_grouping_b(g,98,116)&&v.slice_del()}if(v.cursor=v.limit-s,v.ket=v.cursor,(e=v.find_among_b(f,4))&&(v.bra=v.cursor,c()))switch(e){case 1:v.slice_del();break;case 2:if(v.in_grouping_b(k,98,116)){var t=v.cursor-3;v.limit_backward<=t&&t<=v.limit&&(v.cursor=t,v.slice_del())}}if(v.cursor=v.limit-s,v.ket=v.cursor,(e=v.find_among_b(_,8))&&(v.bra=v.cursor,u()))switch(e){case 1:v.slice_del(),v.ket=v.cursor,v.eq_s_b(2,"ig")&&(v.bra=v.cursor,r=v.limit-v.cursor,v.eq_s_b(1,"e")||(v.cursor=v.limit-r,u()&&v.slice_del()));break;case 2:n=v.limit-v.cursor,v.eq_s_b(1,"e")||(v.cursor=v.limit-n,v.slice_del());break;case 3:if(v.slice_del(),v.ket=v.cursor,i=v.limit-v.cursor,!v.eq_s_b(2,"er")&&(v.cursor=v.limit-i,!v.eq_s_b(2,"en")))break;v.bra=v.cursor,c()&&v.slice_del();break;case 4:v.slice_del(),v.ket=v.cursor,e=v.find_among_b(b,2),e&&(v.bra=v.cursor,u()&&1==e&&v.slice_del())}}var d,l,m,h=[new r("",-1,6),new r("U",0,2),new r("Y",0,1),new r("ä",0,3),new r("ö",0,4),new r("ü",0,5)],w=[new r("e",-1,2),new r("em",-1,1),new r("en",-1,2),new r("ern",-1,1),new r("er",-1,1),new r("s",-1,3),new r("es",5,2)],f=[new r("en",-1,1),new r("er",-1,1),new r("st",-1,2),new r("est",2,1)],b=[new r("ig",-1,1),new r("lich",-1,1)],_=[new r("end",-1,1),new r("ig",-1,2),new r("ung",-1,1),new r("lich",-1,3),new r("isch",-1,2),new r("ik",-1,2),new r("heit",-1,3),new r("keit",-1,4)],p=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,32,8],g=[117,30,5],k=[117,30,4],v=new n;this.setCurrent=function(e){v.setCurrent(e)},this.getCurrent=function(){return v.getCurrent()},this.stem=function(){var e=v.cursor;return i(),v.cursor=e,t(),v.limit_backward=e,v.cursor=v.limit,a(),v.cursor=v.limit_backward,o(),!0}};return 
function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.de.stemmer,"stemmer-de"),e.de.stopWordFilter=e.generateStopWordFilter("aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann das dasselbe dazu daß dein deine deinem deinen deiner deines dem demselben den denn denselben der derer derselbe derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er es etwas euch euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich ihm ihn ihnen ihr ihre ihrem ihren ihrer ihres im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mich mir mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie sind so solche solchem solchen solcher solches soll sollte sondern sonst um und uns unse unsem unsen unser unses unter viel vom von vor war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte während würde würden zu zum zur zwar zwischen über".split(" ")),e.Pipeline.registerFunction(e.de.stopWordFilter,"stopWordFilter-de")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.du.min.js b/assets/javascripts/lunr/min/lunr.du.min.js
new file mode 100644
index 0000000..49a0f3f
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.du.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Dutch` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");console.warn('[Lunr Languages] Please use the "nl" instead of the "du". The "nl" code is the standard code for Dutch language, and "du" will be removed in the next major versions.'),e.du=function(){this.pipeline.reset(),this.pipeline.add(e.du.trimmer,e.du.stopWordFilter,e.du.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.du.stemmer))},e.du.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.du.trimmer=e.trimmerSupport.generateTrimmer(e.du.wordCharacters),e.Pipeline.registerFunction(e.du.trimmer,"trimmer-du"),e.du.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){for(var e,r,i,o=C.cursor;;){if(C.bra=C.cursor,e=C.find_among(b,11))switch(C.ket=C.cursor,e){case 1:C.slice_from("a");continue;case 2:C.slice_from("e");continue;case 3:C.slice_from("i");continue;case 4:C.slice_from("o");continue;case 5:C.slice_from("u");continue;case 6:if(C.cursor>=C.limit)break;C.cursor++;continue}break}for(C.cursor=o,C.bra=o,C.eq_s(1,"y")?(C.ket=C.cursor,C.slice_from("Y")):C.cursor=o;;)if(r=C.cursor,C.in_grouping(q,97,232)){if(i=C.cursor,C.bra=i,C.eq_s(1,"i"))C.ket=C.cursor,C.in_grouping(q,97,232)&&(C.slice_from("I"),C.cursor=r);else if(C.cursor=i,C.eq_s(1,"y"))C.ket=C.cursor,C.slice_from("Y"),C.cursor=r;else if(n(r))break}else if(n(r))break}function n(e){return C.cursor=e,e>=C.limit||(C.cursor++,!1)}function o(){_=C.limit,f=_,t()||(_=C.cursor,_<3&&(_=3),t()||(f=C.cursor))}function 
t(){for(;!C.in_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}for(;!C.out_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}return!1}function s(){for(var e;;)if(C.bra=C.cursor,e=C.find_among(p,3))switch(C.ket=C.cursor,e){case 1:C.slice_from("y");break;case 2:C.slice_from("i");break;case 3:if(C.cursor>=C.limit)return;C.cursor++}}function u(){return _<=C.cursor}function c(){return f<=C.cursor}function a(){var e=C.limit-C.cursor;C.find_among_b(g,3)&&(C.cursor=C.limit-e,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del()))}function l(){var e;w=!1,C.ket=C.cursor,C.eq_s_b(1,"e")&&(C.bra=C.cursor,u()&&(e=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-e,C.slice_del(),w=!0,a())))}function m(){var e;u()&&(e=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-e,C.eq_s_b(3,"gem")||(C.cursor=C.limit-e,C.slice_del(),a())))}function d(){var e,r,i,n,o,t,s=C.limit-C.cursor;if(C.ket=C.cursor,e=C.find_among_b(h,5))switch(C.bra=C.cursor,e){case 1:u()&&C.slice_from("heid");break;case 2:m();break;case 3:u()&&C.out_grouping_b(z,97,232)&&C.slice_del()}if(C.cursor=C.limit-s,l(),C.cursor=C.limit-s,C.ket=C.cursor,C.eq_s_b(4,"heid")&&(C.bra=C.cursor,c()&&(r=C.limit-C.cursor,C.eq_s_b(1,"c")||(C.cursor=C.limit-r,C.slice_del(),C.ket=C.cursor,C.eq_s_b(2,"en")&&(C.bra=C.cursor,m())))),C.cursor=C.limit-s,C.ket=C.cursor,e=C.find_among_b(k,6))switch(C.bra=C.cursor,e){case 1:if(c()){if(C.slice_del(),i=C.limit-C.cursor,C.ket=C.cursor,C.eq_s_b(2,"ig")&&(C.bra=C.cursor,c()&&(n=C.limit-C.cursor,!C.eq_s_b(1,"e")))){C.cursor=C.limit-n,C.slice_del();break}C.cursor=C.limit-i,a()}break;case 2:c()&&(o=C.limit-C.cursor,C.eq_s_b(1,"e")||(C.cursor=C.limit-o,C.slice_del()));break;case 3:c()&&(C.slice_del(),l());break;case 4:c()&&C.slice_del();break;case 
5:c()&&w&&C.slice_del()}C.cursor=C.limit-s,C.out_grouping_b(j,73,232)&&(t=C.limit-C.cursor,C.find_among_b(v,4)&&C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-t,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del())))}var f,_,w,b=[new r("",-1,6),new r("á",0,1),new r("ä",0,1),new r("é",0,2),new r("ë",0,2),new r("í",0,3),new r("ï",0,3),new r("ó",0,4),new r("ö",0,4),new r("ú",0,5),new r("ü",0,5)],p=[new r("",-1,3),new r("I",0,2),new r("Y",0,1)],g=[new r("dd",-1,-1),new r("kk",-1,-1),new r("tt",-1,-1)],h=[new r("ene",-1,2),new r("se",-1,3),new r("en",-1,2),new r("heden",2,1),new r("s",-1,3)],k=[new r("end",-1,1),new r("ig",-1,2),new r("ing",-1,1),new r("lijk",-1,3),new r("baar",-1,4),new r("bar",-1,5)],v=[new r("aa",-1,-1),new r("ee",-1,-1),new r("oo",-1,-1),new r("uu",-1,-1)],q=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],j=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],z=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],C=new i;this.setCurrent=function(e){C.setCurrent(e)},this.getCurrent=function(){return C.getCurrent()},this.stem=function(){var r=C.cursor;return e(),C.cursor=r,o(),C.limit_backward=r,C.cursor=C.limit,d(),C.cursor=C.limit_backward,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.du.stemmer,"stemmer-du"),e.du.stopWordFilter=e.generateStopWordFilter(" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou".split(" ")),e.Pipeline.registerFunction(e.du.stopWordFilter,"stopWordFilter-du")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.el.min.js b/assets/javascripts/lunr/min/lunr.el.min.js
new file mode 100644
index 0000000..ace017b
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.el.min.js
@@ -0,0 +1 @@
+!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.el=function(){this.pipeline.reset(),void 0===this.searchPipeline&&this.pipeline.add(e.el.trimmer,e.el.normilizer),this.pipeline.add(e.el.stopWordFilter,e.el.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.el.stemmer))},e.el.wordCharacters="A-Za-zΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩωΆάΈέΉήΊίΌόΎύΏώΪΐΫΰΐΰ",e.el.trimmer=e.trimmerSupport.generateTrimmer(e.el.wordCharacters),e.Pipeline.registerFunction(e.el.trimmer,"trimmer-el"),e.el.stemmer=function(){function e(e){return s.test(e)}function t(e){return/[ΑΕΗΙΟΥΩ]$/.test(e)}function r(e){return/[ΑΕΗΙΟΩ]$/.test(e)}function n(n){var s=n;if(n.length<3)return s;if(!e(n))return s;if(i.indexOf(n)>=0)return s;var u=new RegExp("(.*)("+Object.keys(l).join("|")+")$"),o=u.exec(s);return 
null!==o&&(s=o[1]+l[o[2]]),null!==(o=/^(.+?)(ΑΔΕΣ|ΑΔΩΝ)$/.exec(s))&&(s=o[1],/(ΟΚ|ΜΑΜ|ΜΑΝ|ΜΠΑΜΠ|ΠΑΤΕΡ|ΓΙΑΓΙ|ΝΤΑΝΤ|ΚΥΡ|ΘΕΙ|ΠΕΘΕΡ|ΜΟΥΣΑΜ|ΚΑΠΛΑΜ|ΠΑΡ|ΨΑΡ|ΤΖΟΥΡ|ΤΑΜΠΟΥΡ|ΓΑΛΑΤ|ΦΑΦΛΑΤ)$/.test(o[1])||(s+="ΑΔ")),null!==(o=/^(.+?)(ΕΔΕΣ|ΕΔΩΝ)$/.exec(s))&&(s=o[1],/(ΟΠ|ΙΠ|ΕΜΠ|ΥΠ|ΓΗΠ|ΔΑΠ|ΚΡΑΣΠ|ΜΙΛ)$/.test(o[1])&&(s+="ΕΔ")),null!==(o=/^(.+?)(ΟΥΔΕΣ|ΟΥΔΩΝ)$/.exec(s))&&(s=o[1],/(ΑΡΚ|ΚΑΛΙΑΚ|ΠΕΤΑΛ|ΛΙΧ|ΠΛΕΞ|ΣΚ|Σ|ΦΛ|ΦΡ|ΒΕΛ|ΛΟΥΛ|ΧΝ|ΣΠ|ΤΡΑΓ|ΦΕ)$/.test(o[1])&&(s+="ΟΥΔ")),null!==(o=/^(.+?)(ΕΩΣ|ΕΩΝ|ΕΑΣ|ΕΑ)$/.exec(s))&&(s=o[1],/^(Θ|Δ|ΕΛ|ΓΑΛ|Ν|Π|ΙΔ|ΠΑΡ|ΣΤΕΡ|ΟΡΦ|ΑΝΔΡ|ΑΝΤΡ)$/.test(o[1])&&(s+="Ε")),null!==(o=/^(.+?)(ΕΙΟ|ΕΙΟΣ|ΕΙΟΙ|ΕΙΑ|ΕΙΑΣ|ΕΙΕΣ|ΕΙΟΥ|ΕΙΟΥΣ|ΕΙΩΝ)$/.exec(s))&&o[1].length>4&&(s=o[1]),null!==(o=/^(.+?)(ΙΟΥΣ|ΙΑΣ|ΙΕΣ|ΙΟΣ|ΙΟΥ|ΙΟΙ|ΙΩΝ|ΙΟΝ|ΙΑ|ΙΟ)$/.exec(s))&&(s=o[1],(t(s)||s.length<2||/^(ΑΓ|ΑΓΓΕΛ|ΑΓΡ|ΑΕΡ|ΑΘΛ|ΑΚΟΥΣ|ΑΞ|ΑΣ|Β|ΒΙΒΛ|ΒΥΤ|Γ|ΓΙΑΓ|ΓΩΝ|Δ|ΔΑΝ|ΔΗΛ|ΔΗΜ|ΔΟΚΙΜ|ΕΛ|ΖΑΧΑΡ|ΗΛ|ΗΠ|ΙΔ|ΙΣΚ|ΙΣΤ|ΙΟΝ|ΙΩΝ|ΚΙΜΩΛ|ΚΟΛΟΝ|ΚΟΡ|ΚΤΗΡ|ΚΥΡ|ΛΑΓ|ΛΟΓ|ΜΑΓ|ΜΠΑΝ|ΜΠΡ|ΝΑΥΤ|ΝΟΤ|ΟΠΑΛ|ΟΞ|ΟΡ|ΟΣ|ΠΑΝΑΓ|ΠΑΤΡ|ΠΗΛ|ΠΗΝ|ΠΛΑΙΣ|ΠΟΝΤ|ΡΑΔ|ΡΟΔ|ΣΚ|ΣΚΟΡΠ|ΣΟΥΝ|ΣΠΑΝ|ΣΤΑΔ|ΣΥΡ|ΤΗΛ|ΤΙΜ|ΤΟΚ|ΤΟΠ|ΤΡΟΧ|ΦΙΛ|ΦΩΤ|Χ|ΧΙΛ|ΧΡΩΜ|ΧΩΡ)$/.test(o[1]))&&(s+="Ι"),/^(ΠΑΛ)$/.test(o[1])&&(s+="ΑΙ")),null!==(o=/^(.+?)(ΙΚΟΣ|ΙΚΟΝ|ΙΚΕΙΣ|ΙΚΟΙ|ΙΚΕΣ|ΙΚΟΥΣ|ΙΚΗ|ΙΚΗΣ|ΙΚΟ|ΙΚΑ|ΙΚΟΥ|ΙΚΩΝ|ΙΚΩΣ)$/.exec(s))&&(s=o[1],(t(s)||/^(ΑΔ|ΑΛ|ΑΜΑΝ|ΑΜΕΡ|ΑΜΜΟΧΑΛ|ΑΝΗΘ|ΑΝΤΙΔ|ΑΠΛ|ΑΤΤ|ΑΦΡ|ΒΑΣ|ΒΡΩΜ|ΓΕΝ|ΓΕΡ|Δ|ΔΙΚΑΝ|ΔΥΤ|ΕΙΔ|ΕΝΔ|ΕΞΩΔ|ΗΘ|ΘΕΤ|ΚΑΛΛΙΝ|ΚΑΛΠ|ΚΑΤΑΔ|ΚΟΥΖΙΝ|ΚΡ|ΚΩΔ|ΛΟΓ|Μ|ΜΕΡ|ΜΟΝΑΔ|ΜΟΥΛ|ΜΟΥΣ|ΜΠΑΓΙΑΤ|ΜΠΑΝ|ΜΠΟΛ|ΜΠΟΣ|ΜΥΣΤ|Ν|ΝΙΤ|ΞΙΚ|ΟΠΤ|ΠΑΝ|ΠΕΤΣ|ΠΙΚΑΝΤ|ΠΙΤΣ|ΠΛΑΣΤ|ΠΛΙΑΤΣ|ΠΟΝΤ|ΠΟΣΤΕΛΝ|ΠΡΩΤΟΔ|ΣΕΡΤ|ΣΗΜΑΝΤ|ΣΤΑΤ|ΣΥΝΑΔ|ΣΥΝΟΜΗΛ|ΤΕΛ|ΤΕΧΝ|ΤΡΟΠ|ΤΣΑΜ|ΥΠΟΔ|Φ|ΦΙΛΟΝ|ΦΥΛΟΔ|ΦΥΣ|ΧΑΣ)$/.test(o[1])||/(ΦΟΙΝ)$/.test(o[1]))&&(s+="ΙΚ")),"ΑΓΑΜΕ"===s&&(s="ΑΓΑΜ"),null!==(o=/^(.+?)(ΑΓΑΜΕ|ΗΣΑΜΕ|ΟΥΣΑΜΕ|ΗΚΑΜΕ|ΗΘΗΚΑΜΕ)$/.exec(s))&&(s=o[1]),null!==(o=/^(.+?)(ΑΜΕ)$/.exec(s))&&(s=o[1],/^(ΑΝΑΠ|ΑΠΟΘ|ΑΠΟΚ|ΑΠΟΣΤ|ΒΟΥΒ|ΞΕΘ|ΟΥΛ|ΠΕΘ|ΠΙΚΡ|ΠΟΤ|ΣΙΧ|Χ)$/.test(o[1])&&(s+="ΑΜ")),null!==(o=/^(.+?)(ΑΓΑΝΕ|ΗΣΑΝΕ|ΟΥΣΑΝΕ|ΙΟΝΤΑΝΕ|ΙΟΤΑΝΕ|ΙΟΥΝΤΑΝΕ|ΟΝΤΑΝΕ|ΟΤΑΝΕ|ΟΥΝΤΑΝΕ|ΗΚΑΝΕ|ΗΘΗΚΑΝΕ)$/.exec(s))&&(s=o[1],/^(ΤΡ|ΤΣ)$/.test(o[1])&&(s+="ΑΓΑΝ")),null!==(o=/^(.+?)
(ΑΝΕ)$/.exec(s))&&(s=o[1],(r(s)||/^(ΒΕΤΕΡ|ΒΟΥΛΚ|ΒΡΑΧΜ|Γ|ΔΡΑΔΟΥΜ|Θ|ΚΑΛΠΟΥΖ|ΚΑΣΤΕΛ|ΚΟΡΜΟΡ|ΛΑΟΠΛ|ΜΩΑΜΕΘ|Μ|ΜΟΥΣΟΥΛΜΑΝ|ΟΥΛ|Π|ΠΕΛΕΚ|ΠΛ|ΠΟΛΙΣ|ΠΟΡΤΟΛ|ΣΑΡΑΚΑΤΣ|ΣΟΥΛΤ|ΤΣΑΡΛΑΤ|ΟΡΦ|ΤΣΙΓΓ|ΤΣΟΠ|ΦΩΤΟΣΤΕΦ|Χ|ΨΥΧΟΠΛ|ΑΓ|ΟΡΦ|ΓΑΛ|ΓΕΡ|ΔΕΚ|ΔΙΠΛ|ΑΜΕΡΙΚΑΝ|ΟΥΡ|ΠΙΘ|ΠΟΥΡΙΤ|Σ|ΖΩΝΤ|ΙΚ|ΚΑΣΤ|ΚΟΠ|ΛΙΧ|ΛΟΥΘΗΡ|ΜΑΙΝΤ|ΜΕΛ|ΣΙΓ|ΣΠ|ΣΤΕΓ|ΤΡΑΓ|ΤΣΑΓ|Φ|ΕΡ|ΑΔΑΠ|ΑΘΙΓΓ|ΑΜΗΧ|ΑΝΙΚ|ΑΝΟΡΓ|ΑΠΗΓ|ΑΠΙΘ|ΑΤΣΙΓΓ|ΒΑΣ|ΒΑΣΚ|ΒΑΘΥΓΑΛ|ΒΙΟΜΗΧ|ΒΡΑΧΥΚ|ΔΙΑΤ|ΔΙΑΦ|ΕΝΟΡΓ|ΘΥΣ|ΚΑΠΝΟΒΙΟΜΗΧ|ΚΑΤΑΓΑΛ|ΚΛΙΒ|ΚΟΙΛΑΡΦ|ΛΙΒ|ΜΕΓΛΟΒΙΟΜΗΧ|ΜΙΚΡΟΒΙΟΜΗΧ|ΝΤΑΒ|ΞΗΡΟΚΛΙΒ|ΟΛΙΓΟΔΑΜ|ΟΛΟΓΑΛ|ΠΕΝΤΑΡΦ|ΠΕΡΗΦ|ΠΕΡΙΤΡ|ΠΛΑΤ|ΠΟΛΥΔΑΠ|ΠΟΛΥΜΗΧ|ΣΤΕΦ|ΤΑΒ|ΤΕΤ|ΥΠΕΡΗΦ|ΥΠΟΚΟΠ|ΧΑΜΗΛΟΔΑΠ|ΨΗΛΟΤΑΒ)$/.test(o[1]))&&(s+="ΑΝ")),null!==(o=/^(.+?)(ΗΣΕΤΕ)$/.exec(s))&&(s=o[1]),null!==(o=/^(.+?)(ΕΤΕ)$/.exec(s))&&(s=o[1],(r(s)||/(ΟΔ|ΑΙΡ|ΦΟΡ|ΤΑΘ|ΔΙΑΘ|ΣΧ|ΕΝΔ|ΕΥΡ|ΤΙΘ|ΥΠΕΡΘ|ΡΑΘ|ΕΝΘ|ΡΟΘ|ΣΘ|ΠΥΡ|ΑΙΝ|ΣΥΝΔ|ΣΥΝ|ΣΥΝΘ|ΧΩΡ|ΠΟΝ|ΒΡ|ΚΑΘ|ΕΥΘ|ΕΚΘ|ΝΕΤ|ΡΟΝ|ΑΡΚ|ΒΑΡ|ΒΟΛ|ΩΦΕΛ)$/.test(o[1])||/^(ΑΒΑΡ|ΒΕΝ|ΕΝΑΡ|ΑΒΡ|ΑΔ|ΑΘ|ΑΝ|ΑΠΛ|ΒΑΡΟΝ|ΝΤΡ|ΣΚ|ΚΟΠ|ΜΠΟΡ|ΝΙΦ|ΠΑΓ|ΠΑΡΑΚΑΛ|ΣΕΡΠ|ΣΚΕΛ|ΣΥΡΦ|ΤΟΚ|Υ|Δ|ΕΜ|ΘΑΡΡ|Θ)$/.test(o[1]))&&(s+="ΕΤ")),null!==(o=/^(.+?)(ΟΝΤΑΣ|ΩΝΤΑΣ)$/.exec(s))&&(s=o[1],/^ΑΡΧ$/.test(o[1])&&(s+="ΟΝΤ"),/ΚΡΕ$/.test(o[1])&&(s+="ΩΝΤ")),null!==(o=/^(.+?)(ΟΜΑΣΤΕ|ΙΟΜΑΣΤΕ)$/.exec(s))&&(s=o[1],/^ΟΝ$/.test(o[1])&&(s+="ΟΜΑΣΤ")),null!==(o=/^(.+?)(ΙΕΣΤΕ)$/.exec(s))&&(s=o[1],/^(Π|ΑΠ|ΣΥΜΠ|ΑΣΥΜΠ|ΑΚΑΤΑΠ|ΑΜΕΤΑΜΦ)$/.test(o[1])&&(s+="ΙΕΣΤ")),null!==(o=/^(.+?)(ΕΣΤΕ)$/.exec(s))&&(s=o[1],/^(ΑΛ|ΑΡ|ΕΚΤΕΛ|Ζ|Μ|Ξ|ΠΑΡΑΚΑΛ|ΠΡΟ|ΝΙΣ)$/.test(o[1])&&(s+="ΕΣΤ")),null!==(o=/^(.+?)(ΗΘΗΚΑ|ΗΘΗΚΕΣ|ΗΘΗΚΕ)$/.exec(s))&&(s=o[1]),null!==(o=/^(.+?)(ΗΚΑ|ΗΚΕΣ|ΗΚΕ)$/.exec(s))&&(s=o[1],(/(ΣΚΩΛ|ΣΚΟΥΛ|ΝΑΡΘ|ΣΦ|ΟΘ|ΠΙΘ)$/.test(o[1])||/^(ΔΙΑΘ|Θ|ΠΑΡΑΚΑΤΑΘ|ΠΡΟΣΘ|ΣΥΝΘ)$/.test(o[1]))&&(s+="ΗΚ")),null!==(o=/^(.+?)(ΟΥΣΑ|ΟΥΣΕΣ|ΟΥΣΕ)$/.exec(s))&&(s=o[1],(t(s)||/^(ΦΑΡΜΑΚ|ΧΑΔ|ΑΓΚ|ΑΝΑΡΡ|ΒΡΟΜ|ΕΚΛΙΠ|ΛΑΜΠΙΔ|ΛΕΧ|Μ|ΠΑΤ|Ρ|Λ|ΜΕΔ|ΜΕΣΑΖ|ΥΠΟΤΕΙΝ|ΑΜ|ΑΙΘ|ΑΝΗΚ|ΔΕΣΠΟΖ|ΕΝΔΙΑΦΕΡ)$/.test(o[1])||/(ΠΟΔΑΡ|ΒΛΕΠ|ΠΑΝΤΑΧ|ΦΡΥΔ|ΜΑΝΤΙΛ|ΜΑΛΛ|ΚΥΜΑΤ|ΛΑΧ|ΛΗΓ|ΦΑΓ|ΟΜ|ΠΡΩΤ)$/.test(o[1]))&&(s+="ΟΥΣ")),null!==(o=/^(.+?)(ΑΓΑ|ΑΓΕΣ|ΑΓΕ)$/.exec(s))&&(s=o[1],(/^(ΑΒΑΣΤ|ΠΟΛΥΦ|ΑΔΗΦ|ΠΑΜΦ|Ρ|ΑΣΠ|ΑΦ|ΑΜΑΛ|ΑΜΑΛΛΙ|ΑΝΥΣΤ|ΑΠΕΡ|ΑΣΠΑΡ|ΑΧ
ΑΡ|ΔΕΡΒΕΝ|ΔΡΟΣΟΠ|ΞΕΦ|ΝΕΟΠ|ΝΟΜΟΤ|ΟΛΟΠ|ΟΜΟΤ|ΠΡΟΣΤ|ΠΡΟΣΩΠΟΠ|ΣΥΜΠ|ΣΥΝΤ|Τ|ΥΠΟΤ|ΧΑΡ|ΑΕΙΠ|ΑΙΜΟΣΤ|ΑΝΥΠ|ΑΠΟΤ|ΑΡΤΙΠ|ΔΙΑΤ|ΕΝ|ΕΠΙΤ|ΚΡΟΚΑΛΟΠ|ΣΙΔΗΡΟΠ|Λ|ΝΑΥ|ΟΥΛΑΜ|ΟΥΡ|Π|ΤΡ|Μ)$/.test(o[1])||/(ΟΦ|ΠΕΛ|ΧΟΡΤ|ΛΛ|ΣΦ|ΡΠ|ΦΡ|ΠΡ|ΛΟΧ|ΣΜΗΝ)$/.test(o[1])&&!/^(ΨΟΦ|ΝΑΥΛΟΧ)$/.test(o[1])||/(ΚΟΛΛ)$/.test(o[1]))&&(s+="ΑΓ")),null!==(o=/^(.+?)(ΗΣΕ|ΗΣΟΥ|ΗΣΑ)$/.exec(s))&&(s=o[1],/^(Ν|ΧΕΡΣΟΝ|ΔΩΔΕΚΑΝ|ΕΡΗΜΟΝ|ΜΕΓΑΛΟΝ|ΕΠΤΑΝ|Ι)$/.test(o[1])&&(s+="ΗΣ")),null!==(o=/^(.+?)(ΗΣΤΕ)$/.exec(s))&&(s=o[1],/^(ΑΣΒ|ΣΒ|ΑΧΡ|ΧΡ|ΑΠΛ|ΑΕΙΜΝ|ΔΥΣΧΡ|ΕΥΧΡ|ΚΟΙΝΟΧΡ|ΠΑΛΙΜΨ)$/.test(o[1])&&(s+="ΗΣΤ")),null!==(o=/^(.+?)(ΟΥΝΕ|ΗΣΟΥΝΕ|ΗΘΟΥΝΕ)$/.exec(s))&&(s=o[1],/^(Ν|Ρ|ΣΠΙ|ΣΤΡΑΒΟΜΟΥΤΣ|ΚΑΚΟΜΟΥΤΣ|ΕΞΩΝ)$/.test(o[1])&&(s+="ΟΥΝ")),null!==(o=/^(.+?)(ΟΥΜΕ|ΗΣΟΥΜΕ|ΗΘΟΥΜΕ)$/.exec(s))&&(s=o[1],/^(ΠΑΡΑΣΟΥΣ|Φ|Χ|ΩΡΙΟΠΛ|ΑΖ|ΑΛΛΟΣΟΥΣ|ΑΣΟΥΣ)$/.test(o[1])&&(s+="ΟΥΜ")),null!=(o=/^(.+?)(ΜΑΤΟΙ|ΜΑΤΟΥΣ|ΜΑΤΟ|ΜΑΤΑ|ΜΑΤΩΣ|ΜΑΤΩΝ|ΜΑΤΟΣ|ΜΑΤΕΣ|ΜΑΤΗ|ΜΑΤΗΣ|ΜΑΤΟΥ)$/.exec(s))&&(s=o[1]+"Μ",/^(ΓΡΑΜ)$/.test(o[1])?s+="Α":/^(ΓΕ|ΣΤΑ)$/.test(o[1])&&(s+="ΑΤ")),null!==(o=/^(.+?)(ΟΥΑ)$/.exec(s))&&(s=o[1]+"ΟΥ"),n.length===s.length&&null!==(o=/^(.+?)(Α|ΑΓΑΤΕ|ΑΓΑΝ|ΑΕΙ|ΑΜΑΙ|ΑΝ|ΑΣ|ΑΣΑΙ|ΑΤΑΙ|ΑΩ|Ε|ΕΙ|ΕΙΣ|ΕΙΤΕ|ΕΣΑΙ|ΕΣ|ΕΤΑΙ|Ι|ΙΕΜΑΙ|ΙΕΜΑΣΤΕ|ΙΕΤΑΙ|ΙΕΣΑΙ|ΙΕΣΑΣΤΕ|ΙΟΜΑΣΤΑΝ|ΙΟΜΟΥΝ|ΙΟΜΟΥΝΑ|ΙΟΝΤΑΝ|ΙΟΝΤΟΥΣΑΝ|ΙΟΣΑΣΤΑΝ|ΙΟΣΑΣΤΕ|ΙΟΣΟΥΝ|ΙΟΣΟΥΝΑ|ΙΟΤΑΝ|ΙΟΥΜΑ|ΙΟΥΜΑΣΤΕ|ΙΟΥΝΤΑΙ|ΙΟΥΝΤΑΝ|Η|ΗΔΕΣ|ΗΔΩΝ|ΗΘΕΙ|ΗΘΕΙΣ|ΗΘΕΙΤΕ|ΗΘΗΚΑΤΕ|ΗΘΗΚΑΝ|ΗΘΟΥΝ|ΗΘΩ|ΗΚΑΤΕ|ΗΚΑΝ|ΗΣ|ΗΣΑΝ|ΗΣΑΤΕ|ΗΣΕΙ|ΗΣΕΣ|ΗΣΟΥΝ|ΗΣΩ|Ο|ΟΙ|ΟΜΑΙ|ΟΜΑΣΤΑΝ|ΟΜΟΥΝ|ΟΜΟΥΝΑ|ΟΝΤΑΙ|ΟΝΤΑΝ|ΟΝΤΟΥΣΑΝ|ΟΣ|ΟΣΑΣΤΑΝ|ΟΣΑΣΤΕ|ΟΣΟΥΝ|ΟΣΟΥΝΑ|ΟΤΑΝ|ΟΥ|ΟΥΜΑΙ|ΟΥΜΑΣΤΕ|ΟΥΝ|ΟΥΝΤΑΙ|ΟΥΝΤΑΝ|ΟΥΣ|ΟΥΣΑΝ|ΟΥΣΑΤΕ|Υ||ΥΑ|ΥΣ|Ω|ΩΝ|ΟΙΣ)$/.exec(s))&&(s=o[1]),null!=(o=/^(.+?)(ΕΣΤΕΡ|ΕΣΤΑΤ|ΟΤΕΡ|ΟΤΑΤ|ΥΤΕΡ|ΥΤΑΤ|ΩΤΕΡ|ΩΤΑΤ)$/.exec(s))&&(/^(ΕΞ|ΕΣ|ΑΝ|ΚΑΤ|Κ|ΠΡ)$/.test(o[1])||(s=o[1]),/^(ΚΑ|Μ|ΕΛΕ|ΛΕ|ΔΕ)$/.test(o[1])&&(s+="ΥΤ")),s}var 
l={"ΦΑΓΙΑ":"ΦΑ","ΦΑΓΙΟΥ":"ΦΑ","ΦΑΓΙΩΝ":"ΦΑ","ΣΚΑΓΙΑ":"ΣΚΑ","ΣΚΑΓΙΟΥ":"ΣΚΑ","ΣΚΑΓΙΩΝ":"ΣΚΑ","ΣΟΓΙΟΥ":"ΣΟ","ΣΟΓΙΑ":"ΣΟ","ΣΟΓΙΩΝ":"ΣΟ","ΤΑΤΟΓΙΑ":"ΤΑΤΟ","ΤΑΤΟΓΙΟΥ":"ΤΑΤΟ","ΤΑΤΟΓΙΩΝ":"ΤΑΤΟ","ΚΡΕΑΣ":"ΚΡΕ","ΚΡΕΑΤΟΣ":"ΚΡΕ","ΚΡΕΑΤΑ":"ΚΡΕ","ΚΡΕΑΤΩΝ":"ΚΡΕ","ΠΕΡΑΣ":"ΠΕΡ","ΠΕΡΑΤΟΣ":"ΠΕΡ","ΠΕΡΑΤΑ":"ΠΕΡ","ΠΕΡΑΤΩΝ":"ΠΕΡ","ΤΕΡΑΣ":"ΤΕΡ","ΤΕΡΑΤΟΣ":"ΤΕΡ","ΤΕΡΑΤΑ":"ΤΕΡ","ΤΕΡΑΤΩΝ":"ΤΕΡ","ΦΩΣ":"ΦΩ","ΦΩΤΟΣ":"ΦΩ","ΦΩΤΑ":"ΦΩ","ΦΩΤΩΝ":"ΦΩ","ΚΑΘΕΣΤΩΣ":"ΚΑΘΕΣΤ","ΚΑΘΕΣΤΩΤΟΣ":"ΚΑΘΕΣΤ","ΚΑΘΕΣΤΩΤΑ":"ΚΑΘΕΣΤ","ΚΑΘΕΣΤΩΤΩΝ":"ΚΑΘΕΣΤ","ΓΕΓΟΝΟΣ":"ΓΕΓΟΝ","ΓΕΓΟΝΟΤΟΣ":"ΓΕΓΟΝ","ΓΕΓΟΝΟΤΑ":"ΓΕΓΟΝ","ΓΕΓΟΝΟΤΩΝ":"ΓΕΓΟΝ","ΕΥΑ":"ΕΥ"},i=["ΑΚΡΙΒΩΣ","ΑΛΑ","ΑΛΛΑ","ΑΛΛΙΩΣ","ΑΛΛΟΤΕ","ΑΜΑ","ΑΝΩ","ΑΝΑ","ΑΝΑΜΕΣΑ","ΑΝΑΜΕΤΑΞΥ","ΑΝΕΥ","ΑΝΤΙ","ΑΝΤΙΠΕΡΑ","ΑΝΤΙΟ","ΑΞΑΦΝΑ","ΑΠΟ","ΑΠΟΨΕ","ΑΡΑ","ΑΡΑΓΕ","ΑΥΡΙΟ","ΑΦΟΙ","ΑΦΟΥ","ΑΦΟΤΟΥ","ΒΡΕ","ΓΕΙΑ","ΓΙΑ","ΓΙΑΤΙ","ΓΡΑΜΜΑ","ΔΕΗ","ΔΕΝ","ΔΗΛΑΔΗ","ΔΙΧΩΣ","ΔΥΟ","ΕΑΝ","ΕΓΩ","ΕΔΩ","ΕΔΑ","ΕΙΘΕ","ΕΙΜΑΙ","ΕΙΜΑΣΤΕ","ΕΙΣΑΙ","ΕΙΣΑΣΤΕ","ΕΙΝΑΙ","ΕΙΣΤΕ","ΕΙΤΕ","ΕΚΕΙ","ΕΚΟ","ΕΛΑ","ΕΜΑΣ","ΕΜΕΙΣ","ΕΝΤΕΛΩΣ","ΕΝΤΟΣ","ΕΝΤΩΜΕΤΑΞΥ","ΕΝΩ","ΕΞΙ","ΕΞΙΣΟΥ","ΕΞΗΣ","ΕΞΩ","ΕΟΚ","ΕΠΑΝΩ","ΕΠΕΙΔΗ","ΕΠΕΙΤΑ","ΕΠΙ","ΕΠΙΣΗΣ","ΕΠΟΜΕΝΩΣ","ΕΠΤΑ","ΕΣΑΣ","ΕΣΕΙΣ","ΕΣΤΩ","ΕΣΥ","ΕΣΩ","ΕΤΣΙ","ΕΥΓΕ","ΕΦΕ","ΕΦΕΞΗΣ","ΕΧΤΕΣ","ΕΩΣ","ΗΔΗ","ΗΜΙ","ΗΠΑ","ΗΤΟΙ","ΘΕΣ","ΙΔΙΩΣ","ΙΔΗ","ΙΚΑ","ΙΣΩΣ","ΚΑΘΕ","ΚΑΘΕΤΙ","ΚΑΘΟΛΟΥ","ΚΑΘΩΣ","ΚΑΙ","ΚΑΝ","ΚΑΠΟΤΕ","ΚΑΠΟΥ","ΚΑΤΑ","ΚΑΤΙ","ΚΑΤΟΠΙΝ","ΚΑΤΩ","ΚΕΙ","ΚΙΧ","ΚΚΕ","ΚΟΛΑΝ","ΚΥΡΙΩΣ","ΚΩΣ","ΜΑΚΑΡΙ","ΜΑΛΙΣΤΑ","ΜΑΛΛΟΝ","ΜΑΙ","ΜΑΟ","ΜΑΟΥΣ","ΜΑΣ","ΜΕΘΑΥΡΙΟ","ΜΕΣ","ΜΕΣΑ","ΜΕΤΑ","ΜΕΤΑΞΥ","ΜΕΧΡΙ","ΜΗΔΕ","ΜΗΝ","ΜΗΠΩΣ","ΜΗΤΕ","ΜΙΑ","ΜΙΑΣ","ΜΙΣ","ΜΜΕ","ΜΟΛΟΝΟΤΙ","ΜΟΥ","ΜΠΑ","ΜΠΑΣ","ΜΠΟΥΦΑΝ","ΜΠΡΟΣ","ΝΑΙ","ΝΕΣ","ΝΤΑ","ΝΤΕ","ΞΑΝΑ","ΟΗΕ","ΟΚΤΩ","ΟΜΩΣ","ΟΝΕ","ΟΠΑ","ΟΠΟΥ","ΟΠΩΣ","ΟΣΟ","ΟΤΑΝ","ΟΤΕ","ΟΤΙ","ΟΥΤΕ","ΟΧΙ","ΠΑΛΙ","ΠΑΝ","ΠΑΝΟ","ΠΑΝΤΟΤΕ","ΠΑΝΤΟΥ","ΠΑΝΤΩΣ","ΠΑΝΩ","ΠΑΡΑ","ΠΕΡΑ","ΠΕΡΙ","ΠΕΡΙΠΟΥ","ΠΙΑ","ΠΙΟ","ΠΙΣΩ","ΠΛΑΙ","ΠΛΕΟΝ","ΠΛΗΝ","ΠΟΤΕ","ΠΟΥ","ΠΡΟ","ΠΡΟΣ","ΠΡΟΧΤΕΣ","ΠΡΟΧΘΕΣ","ΡΟΔΙ","ΠΩΣ","ΣΑΙ","ΣΑΣ","ΣΑΝ","ΣΕΙΣ","ΣΙΑ","ΣΚΙ","ΣΟΙ","ΣΟΥ","ΣΡΙ","ΣΥΝ","ΣΥΝΑΜΑ","ΣΧΕΔΟΝ","ΤΑΔΕ","ΤΑΞΙ","ΤΑΧΑ","ΤΕΙ","ΤΗΝ","ΤΗΣ","ΤΙΠΟΤΑ
","ΤΙΠΟΤΕ","ΤΙΣ","ΤΟΝ","ΤΟΤΕ","ΤΟΥ","ΤΟΥΣ","ΤΣΑ","ΤΣΕ","ΤΣΙ","ΤΣΟΥ","ΤΩΝ","ΥΠΟ","ΥΠΟΨΗ","ΥΠΟΨΙΝ","ΥΣΤΕΡΑ","ΦΕΤΟΣ","ΦΙΣ","ΦΠΑ","ΧΑΦ","ΧΘΕΣ","ΧΤΕΣ","ΧΩΡΙΣ","ΩΣ","ΩΣΑΝ","ΩΣΟΤΟΥ","ΩΣΠΟΥ","ΩΣΤΕ","ΩΣΤΟΣΟ"],s=new RegExp("^[ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]+$");return function(e){return"function"==typeof e.update?e.update(function(e){return n(e.toUpperCase()).toLowerCase()}):n(e.toUpperCase()).toLowerCase()}}(),e.Pipeline.registerFunction(e.el.stemmer,"stemmer-el"),e.el.stopWordFilter=e.generateStopWordFilter("αλλα αν αντι απο αυτα αυτεσ αυτη αυτο αυτοι αυτοσ αυτουσ αυτων για δε δεν εαν ειμαι ειμαστε ειναι εισαι ειστε εκεινα εκεινεσ εκεινη εκεινο εκεινοι εκεινοσ εκεινουσ εκεινων ενω επι η θα ισωσ κ και κατα κι μα με μετα μη μην να ο οι ομωσ οπωσ οσο οτι παρα ποια ποιεσ ποιο ποιοι ποιοσ ποιουσ ποιων που προσ πωσ σε στη στην στο στον τα την τησ το τον τοτε του των ωσ".split(" ")),e.Pipeline.registerFunction(e.el.stopWordFilter,"stopWordFilter-el"),e.el.normilizer=function(){var e={"Ά":"Α","ά":"α","Έ":"Ε","έ":"ε","Ή":"Η","ή":"η","Ί":"Ι","ί":"ι","Ό":"Ο","ο":"ο","Ύ":"Υ","ύ":"υ","Ώ":"Ω","ώ":"ω","Ϊ":"Ι","ϊ":"ι","Ϋ":"Υ","ϋ":"υ","ΐ":"ι","ΰ":"υ"};return function(t){if("function"==typeof t.update)return t.update(function(t){for(var r="",n=0;n<t.length;n++)r+=e[t.charAt(n)]||t.charAt(n);return r});for(var r="",n=0;n<t.length;n++)r+=e[t.charAt(n)]||t.charAt(n);return r}}(),e.Pipeline.registerFunction(e.el.normilizer,"normilizer-el")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.es.min.js b/assets/javascripts/lunr/min/lunr.es.min.js
new file mode 100644
index 0000000..2989d34
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.es.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Spanish` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,s){"function"==typeof define&&define.amd?define(s):"object"==typeof exports?module.exports=s():s()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.es=function(){this.pipeline.reset(),this.pipeline.add(e.es.trimmer,e.es.stopWordFilter,e.es.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.es.stemmer))},e.es.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.es.trimmer=e.trimmerSupport.generateTrimmer(e.es.wordCharacters),e.Pipeline.registerFunction(e.es.trimmer,"trimmer-es"),e.es.stemmer=function(){var s=e.stemmerSupport.Among,r=e.stemmerSupport.SnowballProgram,n=new function(){function e(){if(A.out_grouping(x,97,252)){for(;!A.in_grouping(x,97,252);){if(A.cursor>=A.limit)return!0;A.cursor++}return!1}return!0}function n(){if(A.in_grouping(x,97,252)){var s=A.cursor;if(e()){if(A.cursor=s,!A.in_grouping(x,97,252))return!0;for(;!A.out_grouping(x,97,252);){if(A.cursor>=A.limit)return!0;A.cursor++}}return!1}return!0}function i(){var s,r=A.cursor;if(n()){if(A.cursor=r,!A.out_grouping(x,97,252))return;if(s=A.cursor,e()){if(A.cursor=s,!A.in_grouping(x,97,252)||A.cursor>=A.limit)return;A.cursor++}}g=A.cursor}function a(){for(;!A.in_grouping(x,97,252);){if(A.cursor>=A.limit)return!1;A.cursor++}for(;!A.out_grouping(x,97,252);){if(A.cursor>=A.limit)return!1;A.cursor++}return!0}function t(){var e=A.cursor;g=A.limit,p=g,v=g,i(),A.cursor=e,a()&&(p=A.cursor,a()&&(v=A.cursor))}function o(){for(var e;;){if(A.bra=A.cursor,e=A.find_among(k,6))switch(A.ket=A.cursor,e){case 1:A.slice_from("a");continue;case 2:A.slice_from("e");continue;case 3:A.slice_from("i");continue;case 4:A.slice_from("o");continue;case 5:A.slice_from("u");continue;case 
6:if(A.cursor>=A.limit)break;A.cursor++;continue}break}}function u(){return g<=A.cursor}function w(){return p<=A.cursor}function c(){return v<=A.cursor}function m(){var e;if(A.ket=A.cursor,A.find_among_b(y,13)&&(A.bra=A.cursor,(e=A.find_among_b(q,11))&&u()))switch(e){case 1:A.bra=A.cursor,A.slice_from("iendo");break;case 2:A.bra=A.cursor,A.slice_from("ando");break;case 3:A.bra=A.cursor,A.slice_from("ar");break;case 4:A.bra=A.cursor,A.slice_from("er");break;case 5:A.bra=A.cursor,A.slice_from("ir");break;case 6:A.slice_del();break;case 7:A.eq_s_b(1,"u")&&A.slice_del()}}function l(e,s){if(!c())return!0;A.slice_del(),A.ket=A.cursor;var r=A.find_among_b(e,s);return r&&(A.bra=A.cursor,1==r&&c()&&A.slice_del()),!1}function d(e){return!c()||(A.slice_del(),A.ket=A.cursor,A.eq_s_b(2,e)&&(A.bra=A.cursor,c()&&A.slice_del()),!1)}function b(){var e;if(A.ket=A.cursor,e=A.find_among_b(S,46)){switch(A.bra=A.cursor,e){case 1:if(!c())return!1;A.slice_del();break;case 2:if(d("ic"))return!1;break;case 3:if(!c())return!1;A.slice_from("log");break;case 4:if(!c())return!1;A.slice_from("u");break;case 5:if(!c())return!1;A.slice_from("ente");break;case 6:if(!w())return!1;A.slice_del(),A.ket=A.cursor,e=A.find_among_b(C,4),e&&(A.bra=A.cursor,c()&&(A.slice_del(),1==e&&(A.ket=A.cursor,A.eq_s_b(2,"at")&&(A.bra=A.cursor,c()&&A.slice_del()))));break;case 7:if(l(P,3))return!1;break;case 8:if(l(F,3))return!1;break;case 9:if(d("at"))return!1}return!0}return!1}function f(){var e,s;if(A.cursor>=g&&(s=A.limit_backward,A.limit_backward=g,A.ket=A.cursor,e=A.find_among_b(W,12),A.limit_backward=s,e)){if(A.bra=A.cursor,1==e){if(!A.eq_s_b(1,"u"))return!1;A.slice_del()}return!0}return!1}function _(){var e,s,r,n;if(A.cursor>=g&&(s=A.limit_backward,A.limit_backward=g,A.ket=A.cursor,e=A.find_among_b(L,96),A.limit_backward=s,e))switch(A.bra=A.cursor,e){case 1:r=A.limit-A.cursor,A.eq_s_b(1,"u")?(n=A.limit-A.cursor,A.eq_s_b(1,"g")?A.cursor=A.limit-n:A.cursor=A.limit-r):A.cursor=A.limit-r,A.bra=A.cursor;case 
2:A.slice_del()}}function h(){var e,s;if(A.ket=A.cursor,e=A.find_among_b(z,8))switch(A.bra=A.cursor,e){case 1:u()&&A.slice_del();break;case 2:u()&&(A.slice_del(),A.ket=A.cursor,A.eq_s_b(1,"u")&&(A.bra=A.cursor,s=A.limit-A.cursor,A.eq_s_b(1,"g")&&(A.cursor=A.limit-s,u()&&A.slice_del())))}}var v,p,g,k=[new s("",-1,6),new s("á",0,1),new s("é",0,2),new s("í",0,3),new s("ó",0,4),new s("ú",0,5)],y=[new s("la",-1,-1),new s("sela",0,-1),new s("le",-1,-1),new s("me",-1,-1),new s("se",-1,-1),new s("lo",-1,-1),new s("selo",5,-1),new s("las",-1,-1),new s("selas",7,-1),new s("les",-1,-1),new s("los",-1,-1),new s("selos",10,-1),new s("nos",-1,-1)],q=[new s("ando",-1,6),new s("iendo",-1,6),new s("yendo",-1,7),new s("ándo",-1,2),new s("iéndo",-1,1),new s("ar",-1,6),new s("er",-1,6),new s("ir",-1,6),new s("ár",-1,3),new s("ér",-1,4),new s("ír",-1,5)],C=[new s("ic",-1,-1),new s("ad",-1,-1),new s("os",-1,-1),new s("iv",-1,1)],P=[new s("able",-1,1),new s("ible",-1,1),new s("ante",-1,1)],F=[new s("ic",-1,1),new s("abil",-1,1),new s("iv",-1,1)],S=[new s("ica",-1,1),new s("ancia",-1,2),new s("encia",-1,5),new s("adora",-1,2),new s("osa",-1,1),new s("ista",-1,1),new s("iva",-1,9),new s("anza",-1,1),new s("logía",-1,3),new s("idad",-1,8),new s("able",-1,1),new s("ible",-1,1),new s("ante",-1,2),new s("mente",-1,7),new s("amente",13,6),new s("ación",-1,2),new s("ución",-1,4),new s("ico",-1,1),new s("ismo",-1,1),new s("oso",-1,1),new s("amiento",-1,1),new s("imiento",-1,1),new s("ivo",-1,9),new s("ador",-1,2),new s("icas",-1,1),new s("ancias",-1,2),new s("encias",-1,5),new s("adoras",-1,2),new s("osas",-1,1),new s("istas",-1,1),new s("ivas",-1,9),new s("anzas",-1,1),new s("logías",-1,3),new s("idades",-1,8),new s("ables",-1,1),new s("ibles",-1,1),new s("aciones",-1,2),new s("uciones",-1,4),new s("adores",-1,2),new s("antes",-1,2),new s("icos",-1,1),new s("ismos",-1,1),new s("osos",-1,1),new s("amientos",-1,1),new s("imientos",-1,1),new s("ivos",-1,9)],W=[new s("ya",-1,1),new s("ye",-1,1),new 
s("yan",-1,1),new s("yen",-1,1),new s("yeron",-1,1),new s("yendo",-1,1),new s("yo",-1,1),new s("yas",-1,1),new s("yes",-1,1),new s("yais",-1,1),new s("yamos",-1,1),new s("yó",-1,1)],L=[new s("aba",-1,2),new s("ada",-1,2),new s("ida",-1,2),new s("ara",-1,2),new s("iera",-1,2),new s("ía",-1,2),new s("aría",5,2),new s("ería",5,2),new s("iría",5,2),new s("ad",-1,2),new s("ed",-1,2),new s("id",-1,2),new s("ase",-1,2),new s("iese",-1,2),new s("aste",-1,2),new s("iste",-1,2),new s("an",-1,2),new s("aban",16,2),new s("aran",16,2),new s("ieran",16,2),new s("ían",16,2),new s("arían",20,2),new s("erían",20,2),new s("irían",20,2),new s("en",-1,1),new s("asen",24,2),new s("iesen",24,2),new s("aron",-1,2),new s("ieron",-1,2),new s("arán",-1,2),new s("erán",-1,2),new s("irán",-1,2),new s("ado",-1,2),new s("ido",-1,2),new s("ando",-1,2),new s("iendo",-1,2),new s("ar",-1,2),new s("er",-1,2),new s("ir",-1,2),new s("as",-1,2),new s("abas",39,2),new s("adas",39,2),new s("idas",39,2),new s("aras",39,2),new s("ieras",39,2),new s("ías",39,2),new s("arías",45,2),new s("erías",45,2),new s("irías",45,2),new s("es",-1,1),new s("ases",49,2),new s("ieses",49,2),new s("abais",-1,2),new s("arais",-1,2),new s("ierais",-1,2),new s("íais",-1,2),new s("aríais",55,2),new s("eríais",55,2),new s("iríais",55,2),new s("aseis",-1,2),new s("ieseis",-1,2),new s("asteis",-1,2),new s("isteis",-1,2),new s("áis",-1,2),new s("éis",-1,1),new s("aréis",64,2),new s("eréis",64,2),new s("iréis",64,2),new s("ados",-1,2),new s("idos",-1,2),new s("amos",-1,2),new s("ábamos",70,2),new s("áramos",70,2),new s("iéramos",70,2),new s("íamos",70,2),new s("aríamos",74,2),new s("eríamos",74,2),new s("iríamos",74,2),new s("emos",-1,1),new s("aremos",78,2),new s("eremos",78,2),new s("iremos",78,2),new s("ásemos",78,2),new s("iésemos",78,2),new s("imos",-1,2),new s("arás",-1,2),new s("erás",-1,2),new s("irás",-1,2),new s("ís",-1,2),new s("ará",-1,2),new s("erá",-1,2),new s("irá",-1,2),new s("aré",-1,2),new s("eré",-1,2),new 
s("iré",-1,2),new s("ió",-1,2)],z=[new s("a",-1,1),new s("e",-1,2),new s("o",-1,1),new s("os",-1,1),new s("á",-1,1),new s("é",-1,2),new s("í",-1,1),new s("ó",-1,1)],x=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,1,17,4,10],A=new r;this.setCurrent=function(e){A.setCurrent(e)},this.getCurrent=function(){return A.getCurrent()},this.stem=function(){var e=A.cursor;return t(),A.limit_backward=e,A.cursor=A.limit,m(),A.cursor=A.limit,b()||(A.cursor=A.limit,f()||(A.cursor=A.limit,_())),A.cursor=A.limit,h(),A.cursor=A.limit_backward,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.es.stemmer,"stemmer-es"),e.es.stopWordFilter=e.generateStopWordFilter("a al algo algunas algunos ante antes como con contra cual cuando de del desde donde durante e el ella ellas ellos en entre era erais eran eras eres es esa esas ese eso esos esta estaba estabais estaban estabas estad estada estadas estado estados estamos estando estar estaremos estará estarán estarás estaré estaréis estaría estaríais estaríamos estarían estarías estas este estemos esto estos estoy estuve estuviera estuvierais estuvieran estuvieras estuvieron estuviese estuvieseis estuviesen estuvieses estuvimos estuviste estuvisteis estuviéramos estuviésemos estuvo está estábamos estáis están estás esté estéis estén estés fue fuera fuerais fueran fueras fueron fuese fueseis fuesen fueses fui fuimos fuiste fuisteis fuéramos fuésemos ha habida habidas habido habidos habiendo habremos habrá habrán habrás habré habréis habría habríais habríamos habrían habrías habéis había habíais habíamos habían habías han has hasta hay haya hayamos hayan hayas hayáis he hemos hube hubiera hubierais hubieran hubieras hubieron hubiese hubieseis hubiesen hubieses hubimos hubiste hubisteis hubiéramos hubiésemos hubo la las le les lo los me mi mis mucho muchos muy más mí mía mías mío míos nada ni no nos 
nosotras nosotros nuestra nuestras nuestro nuestros o os otra otras otro otros para pero poco por porque que quien quienes qué se sea seamos sean seas seremos será serán serás seré seréis sería seríais seríamos serían serías seáis sido siendo sin sobre sois somos son soy su sus suya suyas suyo suyos sí también tanto te tendremos tendrá tendrán tendrás tendré tendréis tendría tendríais tendríamos tendrían tendrías tened tenemos tenga tengamos tengan tengas tengo tengáis tenida tenidas tenido tenidos teniendo tenéis tenía teníais teníamos tenían tenías ti tiene tienen tienes todo todos tu tus tuve tuviera tuvierais tuvieran tuvieras tuvieron tuviese tuvieseis tuviesen tuvieses tuvimos tuviste tuvisteis tuviéramos tuviésemos tuvo tuya tuyas tuyo tuyos tú un una uno unos vosotras vosotros vuestra vuestras vuestro vuestros y ya yo él éramos".split(" ")),e.Pipeline.registerFunction(e.es.stopWordFilter,"stopWordFilter-es")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.fi.min.js b/assets/javascripts/lunr/min/lunr.fi.min.js
new file mode 100644
index 0000000..29f5dfc
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.fi.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Finnish` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(i,e){"function"==typeof define&&define.amd?define(e):"object"==typeof exports?module.exports=e():e()(i.lunr)}(this,function(){return function(i){if(void 0===i)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===i.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");i.fi=function(){this.pipeline.reset(),this.pipeline.add(i.fi.trimmer,i.fi.stopWordFilter,i.fi.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(i.fi.stemmer))},i.fi.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",i.fi.trimmer=i.trimmerSupport.generateTrimmer(i.fi.wordCharacters),i.Pipeline.registerFunction(i.fi.trimmer,"trimmer-fi"),i.fi.stemmer=function(){var e=i.stemmerSupport.Among,r=i.stemmerSupport.SnowballProgram,n=new function(){function i(){f=A.limit,d=f,n()||(f=A.cursor,n()||(d=A.cursor))}function n(){for(var i;;){if(i=A.cursor,A.in_grouping(W,97,246))break;if(A.cursor=i,i>=A.limit)return!0;A.cursor++}for(A.cursor=i;!A.out_grouping(W,97,246);){if(A.cursor>=A.limit)return!0;A.cursor++}return!1}function t(){return d<=A.cursor}function s(){var i,e;if(A.cursor>=f)if(e=A.limit_backward,A.limit_backward=f,A.ket=A.cursor,i=A.find_among_b(h,10)){switch(A.bra=A.cursor,A.limit_backward=e,i){case 1:if(!A.in_grouping_b(x,97,246))return;break;case 2:if(!t())return}A.slice_del()}else A.limit_backward=e}function o(){var i,e,r;if(A.cursor>=f)if(e=A.limit_backward,A.limit_backward=f,A.ket=A.cursor,i=A.find_among_b(v,9))switch(A.bra=A.cursor,A.limit_backward=e,i){case 1:r=A.limit-A.cursor,A.eq_s_b(1,"k")||(A.cursor=A.limit-r,A.slice_del());break;case 2:A.slice_del(),A.ket=A.cursor,A.eq_s_b(3,"kse")&&(A.bra=A.cursor,A.slice_from("ksi"));break;case 3:A.slice_del();break;case 4:A.find_among_b(p,6)&&A.slice_del();break;case 5:A.find_among_b(g,6)&&A.slice_del();break;case 
6:A.find_among_b(j,2)&&A.slice_del()}else A.limit_backward=e}function l(){return A.find_among_b(q,7)}function a(){return A.eq_s_b(1,"i")&&A.in_grouping_b(L,97,246)}function u(){var i,e,r;if(A.cursor>=f)if(e=A.limit_backward,A.limit_backward=f,A.ket=A.cursor,i=A.find_among_b(C,30)){switch(A.bra=A.cursor,A.limit_backward=e,i){case 1:if(!A.eq_s_b(1,"a"))return;break;case 2:case 9:if(!A.eq_s_b(1,"e"))return;break;case 3:if(!A.eq_s_b(1,"i"))return;break;case 4:if(!A.eq_s_b(1,"o"))return;break;case 5:if(!A.eq_s_b(1,"ä"))return;break;case 6:if(!A.eq_s_b(1,"ö"))return;break;case 7:if(r=A.limit-A.cursor,!l()&&(A.cursor=A.limit-r,!A.eq_s_b(2,"ie"))){A.cursor=A.limit-r;break}if(A.cursor=A.limit-r,A.cursor<=A.limit_backward){A.cursor=A.limit-r;break}A.cursor--,A.bra=A.cursor;break;case 8:if(!A.in_grouping_b(W,97,246)||!A.out_grouping_b(W,97,246))return}A.slice_del(),k=!0}else A.limit_backward=e}function c(){var i,e,r;if(A.cursor>=d)if(e=A.limit_backward,A.limit_backward=d,A.ket=A.cursor,i=A.find_among_b(P,14)){if(A.bra=A.cursor,A.limit_backward=e,1==i){if(r=A.limit-A.cursor,A.eq_s_b(2,"po"))return;A.cursor=A.limit-r}A.slice_del()}else A.limit_backward=e}function m(){var i;A.cursor>=f&&(i=A.limit_backward,A.limit_backward=f,A.ket=A.cursor,A.find_among_b(F,2)?(A.bra=A.cursor,A.limit_backward=i,A.slice_del()):A.limit_backward=i)}function w(){var i,e,r,n,t,s;if(A.cursor>=f){if(e=A.limit_backward,A.limit_backward=f,A.ket=A.cursor,A.eq_s_b(1,"t")&&(A.bra=A.cursor,r=A.limit-A.cursor,A.in_grouping_b(W,97,246)&&(A.cursor=A.limit-r,A.slice_del(),A.limit_backward=e,n=A.limit-A.cursor,A.cursor>=d&&(A.cursor=d,t=A.limit_backward,A.limit_backward=A.cursor,A.cursor=A.limit-n,A.ket=A.cursor,i=A.find_among_b(S,2))))){if(A.bra=A.cursor,A.limit_backward=t,1==i){if(s=A.limit-A.cursor,A.eq_s_b(2,"po"))return;A.cursor=A.limit-s}return void A.slice_del()}A.limit_backward=e}}function _(){var 
i,e,r,n;if(A.cursor>=f){for(i=A.limit_backward,A.limit_backward=f,e=A.limit-A.cursor,l()&&(A.cursor=A.limit-e,A.ket=A.cursor,A.cursor>A.limit_backward&&(A.cursor--,A.bra=A.cursor,A.slice_del())),A.cursor=A.limit-e,A.ket=A.cursor,A.in_grouping_b(y,97,228)&&(A.bra=A.cursor,A.out_grouping_b(W,97,246)&&A.slice_del()),A.cursor=A.limit-e,A.ket=A.cursor,A.eq_s_b(1,"j")&&(A.bra=A.cursor,r=A.limit-A.cursor,A.eq_s_b(1,"o")?A.slice_del():(A.cursor=A.limit-r,A.eq_s_b(1,"u")&&A.slice_del())),A.cursor=A.limit-e,A.ket=A.cursor,A.eq_s_b(1,"o")&&(A.bra=A.cursor,A.eq_s_b(1,"j")&&A.slice_del()),A.cursor=A.limit-e,A.limit_backward=i;;){if(n=A.limit-A.cursor,A.out_grouping_b(W,97,246)){A.cursor=A.limit-n;break}if(A.cursor=A.limit-n,A.cursor<=A.limit_backward)return;A.cursor--}A.ket=A.cursor,A.cursor>A.limit_backward&&(A.cursor--,A.bra=A.cursor,b=A.slice_to(),A.eq_v_b(b)&&A.slice_del())}}var k,b,d,f,h=[new e("pa",-1,1),new e("sti",-1,2),new e("kaan",-1,1),new e("han",-1,1),new e("kin",-1,1),new e("hän",-1,1),new e("kään",-1,1),new e("ko",-1,1),new e("pä",-1,1),new e("kö",-1,1)],p=[new e("lla",-1,-1),new e("na",-1,-1),new e("ssa",-1,-1),new e("ta",-1,-1),new e("lta",3,-1),new e("sta",3,-1)],g=[new e("llä",-1,-1),new e("nä",-1,-1),new e("ssä",-1,-1),new e("tä",-1,-1),new e("ltä",3,-1),new e("stä",3,-1)],j=[new e("lle",-1,-1),new e("ine",-1,-1)],v=[new e("nsa",-1,3),new e("mme",-1,3),new e("nne",-1,3),new e("ni",-1,2),new e("si",-1,1),new e("an",-1,4),new e("en",-1,6),new e("än",-1,5),new e("nsä",-1,3)],q=[new e("aa",-1,-1),new e("ee",-1,-1),new e("ii",-1,-1),new e("oo",-1,-1),new e("uu",-1,-1),new e("ää",-1,-1),new e("öö",-1,-1)],C=[new e("a",-1,8),new e("lla",0,-1),new e("na",0,-1),new e("ssa",0,-1),new e("ta",0,-1),new e("lta",4,-1),new e("sta",4,-1),new e("tta",4,9),new e("lle",-1,-1),new e("ine",-1,-1),new e("ksi",-1,-1),new e("n",-1,7),new e("han",11,1),new e("den",11,-1,a),new e("seen",11,-1,l),new e("hen",11,2),new e("tten",11,-1,a),new e("hin",11,3),new e("siin",11,-1,a),new 
e("hon",11,4),new e("hän",11,5),new e("hön",11,6),new e("ä",-1,8),new e("llä",22,-1),new e("nä",22,-1),new e("ssä",22,-1),new e("tä",22,-1),new e("ltä",26,-1),new e("stä",26,-1),new e("ttä",26,9)],P=[new e("eja",-1,-1),new e("mma",-1,1),new e("imma",1,-1),new e("mpa",-1,1),new e("impa",3,-1),new e("mmi",-1,1),new e("immi",5,-1),new e("mpi",-1,1),new e("impi",7,-1),new e("ejä",-1,-1),new e("mmä",-1,1),new e("immä",10,-1),new e("mpä",-1,1),new e("impä",12,-1)],F=[new e("i",-1,-1),new e("j",-1,-1)],S=[new e("mma",-1,1),new e("imma",0,-1)],y=[17,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8],W=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,32],L=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,32],x=[17,97,24,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,32],A=new r;this.setCurrent=function(i){A.setCurrent(i)},this.getCurrent=function(){return A.getCurrent()},this.stem=function(){var e=A.cursor;return i(),k=!1,A.limit_backward=e,A.cursor=A.limit,s(),A.cursor=A.limit,o(),A.cursor=A.limit,u(),A.cursor=A.limit,c(),A.cursor=A.limit,k?(m(),A.cursor=A.limit):(A.cursor=A.limit,w(),A.cursor=A.limit),_(),!0}};return function(i){return"function"==typeof i.update?i.update(function(i){return n.setCurrent(i),n.stem(),n.getCurrent()}):(n.setCurrent(i),n.stem(),n.getCurrent())}}(),i.Pipeline.registerFunction(i.fi.stemmer,"stemmer-fi"),i.fi.stopWordFilter=i.generateStopWordFilter("ei eivät emme en et ette että he heidän heidät heihin heille heillä heiltä heissä heistä heitä hän häneen hänelle hänellä häneltä hänen hänessä hänestä hänet häntä itse ja johon joiden joihin joiksi joilla joille joilta joina joissa joista joita joka joksi jolla jolle jolta jona jonka jos jossa josta jota jotka kanssa keiden keihin keiksi keille keillä keiltä keinä keissä keistä keitä keneen keneksi kenelle kenellä keneltä kenen kenenä kenessä kenestä kenet ketkä ketkä ketä koska kuin kuka kun me meidän meidät meihin meille meillä meiltä meissä meistä meitä mihin miksi mikä mille millä miltä minkä minkä minua minulla minulle minulta minun minussa 
minusta minut minuun minä minä missä mistä mitkä mitä mukaan mutta ne niiden niihin niiksi niille niillä niiltä niin niin niinä niissä niistä niitä noiden noihin noiksi noilla noille noilta noin noina noissa noista noita nuo nyt näiden näihin näiksi näille näillä näiltä näinä näissä näistä näitä nämä ole olemme olen olet olette oli olimme olin olisi olisimme olisin olisit olisitte olisivat olit olitte olivat olla olleet ollut on ovat poikki se sekä sen siihen siinä siitä siksi sille sillä sillä siltä sinua sinulla sinulle sinulta sinun sinussa sinusta sinut sinuun sinä sinä sitä tai te teidän teidät teihin teille teillä teiltä teissä teistä teitä tuo tuohon tuoksi tuolla tuolle tuolta tuon tuona tuossa tuosta tuota tähän täksi tälle tällä tältä tämä tämän tänä tässä tästä tätä vaan vai vaikka yli".split(" ")),i.Pipeline.registerFunction(i.fi.stopWordFilter,"stopWordFilter-fi")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.fr.min.js b/assets/javascripts/lunr/min/lunr.fr.min.js
new file mode 100644
index 0000000..68cd009
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.fr.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `French` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.fr=function(){this.pipeline.reset(),this.pipeline.add(e.fr.trimmer,e.fr.stopWordFilter,e.fr.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.fr.stemmer))},e.fr.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.fr.trimmer=e.trimmerSupport.generateTrimmer(e.fr.wordCharacters),e.Pipeline.registerFunction(e.fr.trimmer,"trimmer-fr"),e.fr.stemmer=function(){var r=e.stemmerSupport.Among,s=e.stemmerSupport.SnowballProgram,i=new function(){function e(e,r,s){return!(!W.eq_s(1,e)||(W.ket=W.cursor,!W.in_grouping(F,97,251)))&&(W.slice_from(r),W.cursor=s,!0)}function i(e,r,s){return!!W.eq_s(1,e)&&(W.ket=W.cursor,W.slice_from(r),W.cursor=s,!0)}function n(){for(var r,s;;){if(r=W.cursor,W.in_grouping(F,97,251)){if(W.bra=W.cursor,s=W.cursor,e("u","U",r))continue;if(W.cursor=s,e("i","I",r))continue;if(W.cursor=s,i("y","Y",r))continue}if(W.cursor=r,W.bra=r,!e("y","Y",r)){if(W.cursor=r,W.eq_s(1,"q")&&(W.bra=W.cursor,i("u","U",r)))continue;if(W.cursor=r,r>=W.limit)return;W.cursor++}}}function t(){for(;!W.in_grouping(F,97,251);){if(W.cursor>=W.limit)return!0;W.cursor++}for(;!W.out_grouping(F,97,251);){if(W.cursor>=W.limit)return!0;W.cursor++}return!1}function u(){var e=W.cursor;if(q=W.limit,g=q,p=q,W.in_grouping(F,97,251)&&W.in_grouping(F,97,251)&&W.cursor<W.limit)W.cursor++;else if(W.cursor=e,!W.find_among(v,3)){W.cursor=e;do{if(W.cursor>=W.limit){W.cursor=q;break}W.cursor++}while(!W.in_grouping(F,97,251))}q=W.cursor,W.cursor=e,t()||(g=W.cursor,t()||(p=W.cursor))}function o(){for(var 
e,r;;){if(r=W.cursor,W.bra=r,!(e=W.find_among(h,4)))break;switch(W.ket=W.cursor,e){case 1:W.slice_from("i");break;case 2:W.slice_from("u");break;case 3:W.slice_from("y");break;case 4:if(W.cursor>=W.limit)return;W.cursor++}}}function c(){return q<=W.cursor}function a(){return g<=W.cursor}function l(){return p<=W.cursor}function w(){var e,r;if(W.ket=W.cursor,e=W.find_among_b(C,43)){switch(W.bra=W.cursor,e){case 1:if(!l())return!1;W.slice_del();break;case 2:if(!l())return!1;W.slice_del(),W.ket=W.cursor,W.eq_s_b(2,"ic")&&(W.bra=W.cursor,l()?W.slice_del():W.slice_from("iqU"));break;case 3:if(!l())return!1;W.slice_from("log");break;case 4:if(!l())return!1;W.slice_from("u");break;case 5:if(!l())return!1;W.slice_from("ent");break;case 6:if(!c())return!1;if(W.slice_del(),W.ket=W.cursor,e=W.find_among_b(z,6))switch(W.bra=W.cursor,e){case 1:l()&&(W.slice_del(),W.ket=W.cursor,W.eq_s_b(2,"at")&&(W.bra=W.cursor,l()&&W.slice_del()));break;case 2:l()?W.slice_del():a()&&W.slice_from("eux");break;case 3:l()&&W.slice_del();break;case 4:c()&&W.slice_from("i")}break;case 7:if(!l())return!1;if(W.slice_del(),W.ket=W.cursor,e=W.find_among_b(y,3))switch(W.bra=W.cursor,e){case 1:l()?W.slice_del():W.slice_from("abl");break;case 2:l()?W.slice_del():W.slice_from("iqU");break;case 3:l()&&W.slice_del()}break;case 8:if(!l())return!1;if(W.slice_del(),W.ket=W.cursor,W.eq_s_b(2,"at")&&(W.bra=W.cursor,l()&&(W.slice_del(),W.ket=W.cursor,W.eq_s_b(2,"ic")))){W.bra=W.cursor,l()?W.slice_del():W.slice_from("iqU");break}break;case 9:W.slice_from("eau");break;case 10:if(!a())return!1;W.slice_from("al");break;case 11:if(l())W.slice_del();else{if(!a())return!1;W.slice_from("eux")}break;case 12:if(!a()||!W.out_grouping_b(F,97,251))return!1;W.slice_del();break;case 13:return c()&&W.slice_from("ant"),!1;case 14:return c()&&W.slice_from("ent"),!1;case 15:return r=W.limit-W.cursor,W.in_grouping_b(F,97,251)&&c()&&(W.cursor=W.limit-r,W.slice_del()),!1}return!0}return!1}function f(){var 
e,r;if(W.cursor<q)return!1;if(r=W.limit_backward,W.limit_backward=q,W.ket=W.cursor,!(e=W.find_among_b(x,35)))return W.limit_backward=r,!1;if(W.bra=W.cursor,1==e){if(!W.out_grouping_b(F,97,251))return W.limit_backward=r,!1;W.slice_del()}return W.limit_backward=r,!0}function m(){var e,r,s;if(W.cursor<q)return!1;if(r=W.limit_backward,W.limit_backward=q,W.ket=W.cursor,!(e=W.find_among_b(I,38)))return W.limit_backward=r,!1;switch(W.bra=W.cursor,e){case 1:if(!l())return W.limit_backward=r,!1;W.slice_del();break;case 2:W.slice_del();break;case 3:W.slice_del(),s=W.limit-W.cursor,W.ket=W.cursor,W.eq_s_b(1,"e")?(W.bra=W.cursor,W.slice_del()):W.cursor=W.limit-s}return W.limit_backward=r,!0}function _(){var e,r,s,i,n=W.limit-W.cursor;if(W.ket=W.cursor,W.eq_s_b(1,"s")?(W.bra=W.cursor,r=W.limit-W.cursor,W.out_grouping_b(S,97,232)?(W.cursor=W.limit-r,W.slice_del()):W.cursor=W.limit-n):W.cursor=W.limit-n,W.cursor>=q){if(s=W.limit_backward,W.limit_backward=q,W.ket=W.cursor,e=W.find_among_b(P,7))switch(W.bra=W.cursor,e){case 1:if(l()){if(i=W.limit-W.cursor,!W.eq_s_b(1,"s")&&(W.cursor=W.limit-i,!W.eq_s_b(1,"t")))break;W.slice_del()}break;case 2:W.slice_from("i");break;case 3:W.slice_del();break;case 4:W.eq_s_b(2,"gu")&&W.slice_del()}W.limit_backward=s}}function b(){var e=W.limit-W.cursor;W.find_among_b(U,5)&&(W.cursor=W.limit-e,W.ket=W.cursor,W.cursor>W.limit_backward&&(W.cursor--,W.bra=W.cursor,W.slice_del()))}function d(){for(var e,r=1;W.out_grouping_b(F,97,251);)r--;if(r<=0){if(W.ket=W.cursor,e=W.limit-W.cursor,!W.eq_s_b(1,"é")&&(W.cursor=W.limit-e,!W.eq_s_b(1,"è")))return;W.bra=W.cursor,W.slice_from("e")}}function k(){if(!w()&&(W.cursor=W.limit,!f()&&(W.cursor=W.limit,!m())))return W.cursor=W.limit,void _();W.cursor=W.limit,W.ket=W.cursor,W.eq_s_b(1,"Y")?(W.bra=W.cursor,W.slice_from("i")):(W.cursor=W.limit,W.eq_s_b(1,"ç")&&(W.bra=W.cursor,W.slice_from("c")))}var p,g,q,v=[new r("col",-1,-1),new r("par",-1,-1),new r("tap",-1,-1)],h=[new r("",-1,4),new r("I",0,1),new r("U",0,2),new 
r("Y",0,3)],z=[new r("iqU",-1,3),new r("abl",-1,3),new r("Ièr",-1,4),new r("ièr",-1,4),new r("eus",-1,2),new r("iv",-1,1)],y=[new r("ic",-1,2),new r("abil",-1,1),new r("iv",-1,3)],C=[new r("iqUe",-1,1),new r("atrice",-1,2),new r("ance",-1,1),new r("ence",-1,5),new r("logie",-1,3),new r("able",-1,1),new r("isme",-1,1),new r("euse",-1,11),new r("iste",-1,1),new r("ive",-1,8),new r("if",-1,8),new r("usion",-1,4),new r("ation",-1,2),new r("ution",-1,4),new r("ateur",-1,2),new r("iqUes",-1,1),new r("atrices",-1,2),new r("ances",-1,1),new r("ences",-1,5),new r("logies",-1,3),new r("ables",-1,1),new r("ismes",-1,1),new r("euses",-1,11),new r("istes",-1,1),new r("ives",-1,8),new r("ifs",-1,8),new r("usions",-1,4),new r("ations",-1,2),new r("utions",-1,4),new r("ateurs",-1,2),new r("ments",-1,15),new r("ements",30,6),new r("issements",31,12),new r("ités",-1,7),new r("ment",-1,15),new r("ement",34,6),new r("issement",35,12),new r("amment",34,13),new r("emment",34,14),new r("aux",-1,10),new r("eaux",39,9),new r("eux",-1,1),new r("ité",-1,7)],x=[new r("ira",-1,1),new r("ie",-1,1),new r("isse",-1,1),new r("issante",-1,1),new r("i",-1,1),new r("irai",4,1),new r("ir",-1,1),new r("iras",-1,1),new r("ies",-1,1),new r("îmes",-1,1),new r("isses",-1,1),new r("issantes",-1,1),new r("îtes",-1,1),new r("is",-1,1),new r("irais",13,1),new r("issais",13,1),new r("irions",-1,1),new r("issions",-1,1),new r("irons",-1,1),new r("issons",-1,1),new r("issants",-1,1),new r("it",-1,1),new r("irait",21,1),new r("issait",21,1),new r("issant",-1,1),new r("iraIent",-1,1),new r("issaIent",-1,1),new r("irent",-1,1),new r("issent",-1,1),new r("iront",-1,1),new r("ît",-1,1),new r("iriez",-1,1),new r("issiez",-1,1),new r("irez",-1,1),new r("issez",-1,1)],I=[new r("a",-1,3),new r("era",0,2),new r("asse",-1,3),new r("ante",-1,3),new r("ée",-1,2),new r("ai",-1,3),new r("erai",5,2),new r("er",-1,2),new r("as",-1,3),new r("eras",8,2),new r("âmes",-1,3),new r("asses",-1,3),new r("antes",-1,3),new 
r("âtes",-1,3),new r("ées",-1,2),new r("ais",-1,3),new r("erais",15,2),new r("ions",-1,1),new r("erions",17,2),new r("assions",17,3),new r("erons",-1,2),new r("ants",-1,3),new r("és",-1,2),new r("ait",-1,3),new r("erait",23,2),new r("ant",-1,3),new r("aIent",-1,3),new r("eraIent",26,2),new r("èrent",-1,2),new r("assent",-1,3),new r("eront",-1,2),new r("ât",-1,3),new r("ez",-1,2),new r("iez",32,2),new r("eriez",33,2),new r("assiez",33,3),new r("erez",32,2),new r("é",-1,2)],P=[new r("e",-1,3),new r("Ière",0,2),new r("ière",0,2),new r("ion",-1,1),new r("Ier",-1,2),new r("ier",-1,2),new r("ë",-1,4)],U=[new r("ell",-1,-1),new r("eill",-1,-1),new r("enn",-1,-1),new r("onn",-1,-1),new r("ett",-1,-1)],F=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,128,130,103,8,5],S=[1,65,20,0,0,0,0,0,0,0,0,0,0,0,0,0,128],W=new s;this.setCurrent=function(e){W.setCurrent(e)},this.getCurrent=function(){return W.getCurrent()},this.stem=function(){var e=W.cursor;return n(),W.cursor=e,u(),W.limit_backward=e,W.cursor=W.limit,k(),W.cursor=W.limit,b(),W.cursor=W.limit,d(),W.cursor=W.limit_backward,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.fr.stemmer,"stemmer-fr"),e.fr.stopWordFilter=e.generateStopWordFilter("ai aie aient aies ait as au aura aurai auraient aurais aurait auras aurez auriez aurions aurons auront aux avaient avais avait avec avez aviez avions avons ayant ayez ayons c ce ceci celà ces cet cette d dans de des du elle en es est et eu eue eues eurent eus eusse eussent eusses eussiez eussions eut eux eûmes eût eûtes furent fus fusse fussent fusses fussiez fussions fut fûmes fût fûtes ici il ils j je l la le les leur leurs lui m ma mais me mes moi mon même n ne nos notre nous on ont ou par pas pour qu que quel quelle quelles quels qui s sa sans se sera serai seraient serais serait seras serez seriez serions serons seront ses soi soient 
sois soit sommes son sont soyez soyons suis sur t ta te tes toi ton tu un une vos votre vous y à étaient étais était étant étiez étions été étée étées étés êtes".split(" ")),e.Pipeline.registerFunction(e.fr.stopWordFilter,"stopWordFilter-fr")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.he.min.js b/assets/javascripts/lunr/min/lunr.he.min.js
new file mode 100644
index 0000000..b863d3e
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.he.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.he=function(){this.pipeline.reset(),this.pipeline.add(e.he.trimmer,e.he.stopWordFilter,e.he.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.he.stemmer))},e.he.wordCharacters="֑-״א-תa-zA-Zａ-ｚＡ-Ｚ0-9０-９",e.he.trimmer=e.trimmerSupport.generateTrimmer(e.he.wordCharacters),e.Pipeline.registerFunction(e.he.trimmer,"trimmer-he"),e.he.stemmer=function(){var e=this;return e.result=!1,e.preRemoved=!1,e.sufRemoved=!1,e.pre={pre1:"ה ו י ת",pre2:"ב כ ל מ ש כש",pre3:"הב הכ הל המ הש בש לכ",pre4:"וב וכ ול ומ וש",pre5:"מה שה כל",pre6:"מב מכ מל ממ מש",pre7:"בה בו בי בת כה כו כי כת לה לו לי לת",pre8:"ובה ובו ובי ובת וכה וכו וכי וכת ולה ולו ולי ולת"},e.suf={suf1:"ך כ ם ן נ",suf2:"ים ות וך וכ ום ון ונ הם הן יכ יך ינ ים",suf3:"תי תך תכ תם תן תנ",suf4:"ותי ותך ותכ ותם ותן ותנ",suf5:"נו כם כן הם הן",suf6:"ונו וכם וכן והם והן",suf7:"תכם תכן תנו תהם תהן",suf8:"הוא היא הם הן אני אתה את אנו אתם אתן",suf9:"ני נו כי כו כם כן תי תך תכ תם תן",suf10:"י ך כ ם ן נ ת"},e.patterns=JSON.parse('{"hebrewPatterns": [{"pt1": [{"c": "ה", "l": 0}]}, {"pt2": [{"c": "ו", "l": 0}]}, {"pt3": [{"c": "י", "l": 0}]}, {"pt4": [{"c": "ת", "l": 0}]}, {"pt5": [{"c": "מ", "l": 0}]}, {"pt6": [{"c": "ל", "l": 0}]}, {"pt7": [{"c": "ב", "l": 0}]}, {"pt8": [{"c": "כ", "l": 0}]}, {"pt9": [{"c": "ש", "l": 0}]}, {"pt10": [{"c": "כש", "l": 0}]}, {"pt11": [{"c": "בה", "l": 0}]}, {"pt12": [{"c": "וב", "l": 0}]}, {"pt13": [{"c": "וכ", "l": 0}]}, {"pt14": [{"c": "ול", "l": 0}]}, {"pt15": [{"c": "ומ", "l": 0}]}, {"pt16": [{"c": "וש", "l": 0}]}, {"pt17": [{"c": "הב", "l": 0}]}, {"pt18": [{"c": 
"הכ", "l": 0}]}, {"pt19": [{"c": "הל", "l": 0}]}, {"pt20": [{"c": "המ", "l": 0}]}, {"pt21": [{"c": "הש", "l": 0}]}, {"pt22": [{"c": "מה", "l": 0}]}, {"pt23": [{"c": "שה", "l": 0}]}, {"pt24": [{"c": "כל", "l": 0}]}]}'),e.execArray=["cleanWord","removeDiacritics","removeStopWords","normalizeHebrewCharacters"],e.stem=function(){var r=0;for(e.result=!1,e.preRemoved=!1,e.sufRemoved=!1;r<e.execArray.length&&1!=e.result;)e.result=e[e.execArray[r]](),r++},e.setCurrent=function(r){e.word=r},e.getCurrent=function(){return e.word},e.cleanWord=function(){return!!new RegExp("[^֑-״א-ת]").test("")},e.removeDiacritics=function(){var r=new RegExp("[ְ-ֿ]","g");return e.word=e.word.replace(r,""),!1},e.removeStopWords=function(){if("אבל או אולי אותו אותי אותך אותם אותן אותנו אז אחר אחרות אחרי אחריכן אחרים אחרת אי איזה איך אין איפה אל אלה אלו אם אנחנו אני אף אפשר את אתה אתכם אתכן אתם אתן באיזה באיזו בגלל בין בלבד בעבור בעזרת בכל בכן בלי במידה במקום שבו ברוב בשביל בשעה ש בתוך גם דרך הוא היא היה היי היכן היתה היתי הם הן הנה הסיבה שבגללה הרי ואילו ואת זאת זה זות יהיה יוכל יוכלו יותר מדי יכול יכולה יכולות יכולים יכל יכלה יכלו יש כאן כאשר כולם כולן כזה כי כיצד כך כל כלל כמו כן כפי כש לא לאו לאיזותך לאן לבין לה להיות להם להן לו לזה לזות לי לך לכם לכן למה למעלה למעלה מ למטה למטה מ למעט למקום שבו למרות לנו לעבר לעיכן לפיכך לפני מאד מאחורי מאיזו סיבה מאין מאיפה מבלי מבעד מדוע מה מהיכן מול מחוץ מי מידע מכאן מכל מכן מלבד מן מנין מסוגל מעט מעטים מעל מצד מקום בו מתחת מתי נגד נגר נו עד עז על עלי עליו עליה עליהם עליך עלינו עם עצמה עצמהם עצמהן עצמו עצמי עצמם עצמן עצמנו פה רק שוב של שלה שלהם שלהן שלו שלי שלך שלכה שלכם שלכן שלנו שם תהיה תחת".split(" ").indexOf(e.word)>=0)return!0},e.normalizeHebrewCharacters=function(){return e.word=e.word.replace("ך","כ"),e.word=e.word.replace("ם","מ"),e.word=e.word.replace("ן","נ"),e.word=e.word.replace("ף","פ"),e.word=e.word.replace("ץ","צ"),!1},function(r){return"function"==typeof r.update?r.update(function(r){return 
e.setCurrent(r),e.stem(),e.getCurrent()}):(e.setCurrent(r),e.stem(),e.getCurrent())}}(),e.Pipeline.registerFunction(e.he.stemmer,"stemmer-he"),e.he.stopWordFilter=e.generateStopWordFilter("אבל או אולי אותו אותי אותך אותם אותן אותנו אז אחר אחרות אחרי אחריכן אחרים אחרת אי איזה איך אין איפה אל אלה אלו אם אנחנו אני אף אפשר את אתה אתכם אתכן אתם אתן באיזה באיזו בגלל בין בלבד בעבור בעזרת בכל בכן בלי במידה במקום שבו ברוב בשביל בשעה ש בתוך גם דרך הוא היא היה היי היכן היתה היתי הם הן הנה הסיבה שבגללה הרי ואילו ואת זאת זה זות יהיה יוכל יוכלו יותר מדי יכול יכולה יכולות יכולים יכל יכלה יכלו יש כאן כאשר כולם כולן כזה כי כיצד כך כל כלל כמו כן כפי כש לא לאו לאיזותך לאן לבין לה להיות להם להן לו לזה לזות לי לך לכם לכן למה למעלה למעלה מ למטה למטה מ למעט למקום שבו למרות לנו לעבר לעיכן לפיכך לפני מאד מאחורי מאיזו סיבה מאין מאיפה מבלי מבעד מדוע מה מהיכן מול מחוץ מי מידע מכאן מכל מכן מלבד מן מנין מסוגל מעט מעטים מעל מצד מקום בו מתחת מתי נגד נגר נו עד עז על עלי עליו עליה עליהם עליך עלינו עם עצמה עצמהם עצמהן עצמו עצמי עצמם עצמן עצמנו פה רק שוב של שלה שלהם שלהן שלו שלי שלך שלכה שלכם שלכן שלנו שם תהיה תחת".split(" ")),e.Pipeline.registerFunction(e.he.stopWordFilter,"stopWordFilter-he")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.hi.min.js b/assets/javascripts/lunr/min/lunr.hi.min.js
new file mode 100644
index 0000000..7dbc414
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.hi.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.hi=function(){this.pipeline.reset(),this.pipeline.add(e.hi.trimmer,e.hi.stopWordFilter,e.hi.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.hi.stemmer))},e.hi.wordCharacters="ऀ-ःऄ-एऐ-टठ-यर-िी-ॏॐ-य़ॠ-९॰-ॿa-zA-Zａ-ｚＡ-Ｚ0-9０-９",e.hi.trimmer=e.trimmerSupport.generateTrimmer(e.hi.wordCharacters),e.Pipeline.registerFunction(e.hi.trimmer,"trimmer-hi"),e.hi.stopWordFilter=e.generateStopWordFilter("अत अपना अपनी अपने अभी अंदर आदि आप इत्यादि इन इनका इन्हीं इन्हें इन्हों इस इसका इसकी इसके इसमें इसी इसे उन उनका उनकी उनके उनको उन्हीं उन्हें उन्हों उस उसके उसी उसे एक एवं एस ऐसे और कई कर करता करते करना करने करें कहते कहा का काफ़ी कि कितना किन्हें किन्हों किया किर किस किसी किसे की कुछ कुल के को कोई कौन कौनसा गया घर जब जहाँ जा जितना जिन जिन्हें जिन्हों जिस जिसे जीधर जैसा जैसे जो तक तब तरह तिन तिन्हें तिन्हों तिस तिसे तो था थी थे दबारा दिया दुसरा दूसरे दो द्वारा न नके नहीं ना निहायत नीचे ने पर पहले पूरा पे फिर बनी बही बहुत बाद बाला बिलकुल भी भीतर मगर मानो मे में यदि यह यहाँ यही या यिह ये रखें रहा रहे ऱ्वासा लिए लिये लेकिन व वग़ैरह वर्ग वह वहाँ वहीं वाले वुह वे वो सकता सकते सबसे सभी साथ साबुत साभ सारा से सो संग ही हुआ हुई हुए है हैं हो होता होती होते होना होने".split(" ")),e.hi.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var r=e.wordcut;r.init(),e.hi.tokenizer=function(i){if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(r){return isLunr2?new e.Token(r.toLowerCase()):r.toLowerCase()});var t=i.toString().toLowerCase().replace(/^\s+/,"");return 
r.cut(t).split("|")},e.Pipeline.registerFunction(e.hi.stemmer,"stemmer-hi"),e.Pipeline.registerFunction(e.hi.stopWordFilter,"stopWordFilter-hi")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.hu.min.js b/assets/javascripts/lunr/min/lunr.hu.min.js
new file mode 100644
index 0000000..ed9d909
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.hu.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Hungarian` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,n){"function"==typeof define&&define.amd?define(n):"object"==typeof exports?module.exports=n():n()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.hu=function(){this.pipeline.reset(),this.pipeline.add(e.hu.trimmer,e.hu.stopWordFilter,e.hu.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.hu.stemmer))},e.hu.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.hu.trimmer=e.trimmerSupport.generateTrimmer(e.hu.wordCharacters),e.Pipeline.registerFunction(e.hu.trimmer,"trimmer-hu"),e.hu.stemmer=function(){var n=e.stemmerSupport.Among,r=e.stemmerSupport.SnowballProgram,i=new function(){function e(){var e,n=L.cursor;if(d=L.limit,L.in_grouping(W,97,252))for(;;){if(e=L.cursor,L.out_grouping(W,97,252))return L.cursor=e,L.find_among(g,8)||(L.cursor=e,e<L.limit&&L.cursor++),void(d=L.cursor);if(L.cursor=e,e>=L.limit)return void(d=e);L.cursor++}if(L.cursor=n,L.out_grouping(W,97,252)){for(;!L.in_grouping(W,97,252);){if(L.cursor>=L.limit)return;L.cursor++}d=L.cursor}}function i(){return d<=L.cursor}function a(){var e;if(L.ket=L.cursor,(e=L.find_among_b(h,2))&&(L.bra=L.cursor,i()))switch(e){case 1:L.slice_from("a");break;case 2:L.slice_from("e")}}function t(){var e=L.limit-L.cursor;return!!L.find_among_b(p,23)&&(L.cursor=L.limit-e,!0)}function s(){if(L.cursor>L.limit_backward){L.cursor--,L.ket=L.cursor;var e=L.cursor-1;L.limit_backward<=e&&e<=L.limit&&(L.cursor=e,L.bra=e,L.slice_del())}}function c(){var e;if(L.ket=L.cursor,(e=L.find_among_b(_,2))&&(L.bra=L.cursor,i())){if((1==e||2==e)&&!t())return;L.slice_del(),s()}}function o(){L.ket=L.cursor,L.find_among_b(v,44)&&(L.bra=L.cursor,i()&&(L.slice_del(),a()))}function w(){var 
e;if(L.ket=L.cursor,(e=L.find_among_b(z,3))&&(L.bra=L.cursor,i()))switch(e){case 1:L.slice_from("e");break;case 2:case 3:L.slice_from("a")}}function l(){var e;if(L.ket=L.cursor,(e=L.find_among_b(y,6))&&(L.bra=L.cursor,i()))switch(e){case 1:case 2:L.slice_del();break;case 3:L.slice_from("a");break;case 4:L.slice_from("e")}}function u(){var e;if(L.ket=L.cursor,(e=L.find_among_b(j,2))&&(L.bra=L.cursor,i())){if((1==e||2==e)&&!t())return;L.slice_del(),s()}}function m(){var e;if(L.ket=L.cursor,(e=L.find_among_b(C,7))&&(L.bra=L.cursor,i()))switch(e){case 1:L.slice_from("a");break;case 2:L.slice_from("e");break;case 3:case 4:case 5:case 6:case 7:L.slice_del()}}function k(){var e;if(L.ket=L.cursor,(e=L.find_among_b(P,12))&&(L.bra=L.cursor,i()))switch(e){case 1:case 4:case 7:case 9:L.slice_del();break;case 2:case 5:case 8:L.slice_from("e");break;case 3:case 6:L.slice_from("a")}}function f(){var e;if(L.ket=L.cursor,(e=L.find_among_b(F,31))&&(L.bra=L.cursor,i()))switch(e){case 1:case 4:case 7:case 8:case 9:case 12:case 13:case 16:case 17:case 18:L.slice_del();break;case 2:case 5:case 10:case 14:case 19:L.slice_from("a");break;case 3:case 6:case 11:case 15:case 20:L.slice_from("e")}}function b(){var e;if(L.ket=L.cursor,(e=L.find_among_b(S,42))&&(L.bra=L.cursor,i()))switch(e){case 1:case 4:case 5:case 6:case 9:case 10:case 11:case 14:case 15:case 16:case 17:case 20:case 21:case 24:case 25:case 26:case 29:L.slice_del();break;case 2:case 7:case 12:case 18:case 22:case 27:L.slice_from("a");break;case 3:case 8:case 13:case 19:case 23:case 28:L.slice_from("e")}}var d,g=[new n("cs",-1,-1),new n("dzs",-1,-1),new n("gy",-1,-1),new n("ly",-1,-1),new n("ny",-1,-1),new n("sz",-1,-1),new n("ty",-1,-1),new n("zs",-1,-1)],h=[new n("á",-1,1),new n("é",-1,2)],p=[new n("bb",-1,-1),new n("cc",-1,-1),new n("dd",-1,-1),new n("ff",-1,-1),new n("gg",-1,-1),new n("jj",-1,-1),new n("kk",-1,-1),new n("ll",-1,-1),new n("mm",-1,-1),new n("nn",-1,-1),new n("pp",-1,-1),new n("rr",-1,-1),new 
n("ccs",-1,-1),new n("ss",-1,-1),new n("zzs",-1,-1),new n("tt",-1,-1),new n("vv",-1,-1),new n("ggy",-1,-1),new n("lly",-1,-1),new n("nny",-1,-1),new n("tty",-1,-1),new n("ssz",-1,-1),new n("zz",-1,-1)],_=[new n("al",-1,1),new n("el",-1,2)],v=[new n("ba",-1,-1),new n("ra",-1,-1),new n("be",-1,-1),new n("re",-1,-1),new n("ig",-1,-1),new n("nak",-1,-1),new n("nek",-1,-1),new n("val",-1,-1),new n("vel",-1,-1),new n("ul",-1,-1),new n("nál",-1,-1),new n("nél",-1,-1),new n("ból",-1,-1),new n("ról",-1,-1),new n("tól",-1,-1),new n("bõl",-1,-1),new n("rõl",-1,-1),new n("tõl",-1,-1),new n("ül",-1,-1),new n("n",-1,-1),new n("an",19,-1),new n("ban",20,-1),new n("en",19,-1),new n("ben",22,-1),new n("képpen",22,-1),new n("on",19,-1),new n("ön",19,-1),new n("képp",-1,-1),new n("kor",-1,-1),new n("t",-1,-1),new n("at",29,-1),new n("et",29,-1),new n("ként",29,-1),new n("anként",32,-1),new n("enként",32,-1),new n("onként",32,-1),new n("ot",29,-1),new n("ért",29,-1),new n("öt",29,-1),new n("hez",-1,-1),new n("hoz",-1,-1),new n("höz",-1,-1),new n("vá",-1,-1),new n("vé",-1,-1)],z=[new n("án",-1,2),new n("én",-1,1),new n("ánként",-1,3)],y=[new n("stul",-1,2),new n("astul",0,1),new n("ástul",0,3),new n("stül",-1,2),new n("estül",3,1),new n("éstül",3,4)],j=[new n("á",-1,1),new n("é",-1,2)],C=[new n("k",-1,7),new n("ak",0,4),new n("ek",0,6),new n("ok",0,5),new n("ák",0,1),new n("ék",0,2),new n("ök",0,3)],P=[new n("éi",-1,7),new n("áéi",0,6),new n("ééi",0,5),new n("é",-1,9),new n("ké",3,4),new n("aké",4,1),new n("eké",4,1),new n("oké",4,1),new n("áké",4,3),new n("éké",4,2),new n("öké",4,1),new n("éé",3,8)],F=[new n("a",-1,18),new n("ja",0,17),new n("d",-1,16),new n("ad",2,13),new n("ed",2,13),new n("od",2,13),new n("ád",2,14),new n("éd",2,15),new n("öd",2,13),new n("e",-1,18),new n("je",9,17),new n("nk",-1,4),new n("unk",11,1),new n("ánk",11,2),new n("énk",11,3),new n("ünk",11,1),new n("uk",-1,8),new n("juk",16,7),new n("ájuk",17,5),new n("ük",-1,8),new n("jük",19,7),new n("éjük",20,6),new 
n("m",-1,12),new n("am",22,9),new n("em",22,9),new n("om",22,9),new n("ám",22,10),new n("ém",22,11),new n("o",-1,18),new n("á",-1,19),new n("é",-1,20)],S=[new n("id",-1,10),new n("aid",0,9),new n("jaid",1,6),new n("eid",0,9),new n("jeid",3,6),new n("áid",0,7),new n("éid",0,8),new n("i",-1,15),new n("ai",7,14),new n("jai",8,11),new n("ei",7,14),new n("jei",10,11),new n("ái",7,12),new n("éi",7,13),new n("itek",-1,24),new n("eitek",14,21),new n("jeitek",15,20),new n("éitek",14,23),new n("ik",-1,29),new n("aik",18,26),new n("jaik",19,25),new n("eik",18,26),new n("jeik",21,25),new n("áik",18,27),new n("éik",18,28),new n("ink",-1,20),new n("aink",25,17),new n("jaink",26,16),new n("eink",25,17),new n("jeink",28,16),new n("áink",25,18),new n("éink",25,19),new n("aitok",-1,21),new n("jaitok",32,20),new n("áitok",-1,22),new n("im",-1,5),new n("aim",35,4),new n("jaim",36,1),new n("eim",35,4),new n("jeim",38,1),new n("áim",35,2),new n("éim",35,3)],W=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,1,17,52,14],L=new r;this.setCurrent=function(e){L.setCurrent(e)},this.getCurrent=function(){return L.getCurrent()},this.stem=function(){var n=L.cursor;return e(),L.limit_backward=n,L.cursor=L.limit,c(),L.cursor=L.limit,o(),L.cursor=L.limit,w(),L.cursor=L.limit,l(),L.cursor=L.limit,u(),L.cursor=L.limit,k(),L.cursor=L.limit,f(),L.cursor=L.limit,b(),L.cursor=L.limit,m(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.hu.stemmer,"stemmer-hu"),e.hu.stopWordFilter=e.generateStopWordFilter("a abban ahhoz ahogy ahol aki akik akkor alatt amely amelyek amelyekben amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az azok azon azonban azt aztán azután azzal azért be belül benne bár cikk cikkek cikkeket csak de e ebben eddig egy egyes egyetlen egyik egyre egyéb egész ehhez ekkor el ellen elsõ elég elõ elõször elõtt emilyen ennek erre 
ez ezek ezen ezt ezzel ezért fel felé hanem hiszen hogy hogyan igen ill ill. illetve ilyen ilyenkor ismét ison itt jobban jó jól kell kellett keressünk keresztül ki kívül között közül legalább legyen lehet lehetett lenne lenni lesz lett maga magát majd majd meg mellett mely melyek mert mi mikor milyen minden mindenki mindent mindig mint mintha mit mivel miért most már más másik még míg nagy nagyobb nagyon ne nekem neki nem nincs néha néhány nélkül olyan ott pedig persze rá s saját sem semmi sok sokat sokkal szemben szerint szinte számára talán tehát teljes tovább továbbá több ugyanis utolsó után utána vagy vagyis vagyok valaki valami valamint való van vannak vele vissza viszont volna volt voltak voltam voltunk által általában át én éppen és így õ õk õket össze úgy új újabb újra".split(" ")),e.Pipeline.registerFunction(e.hu.stopWordFilter,"stopWordFilter-hu")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.hy.min.js b/assets/javascripts/lunr/min/lunr.hy.min.js
new file mode 100644
index 0000000..b37f792
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.hy.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.hy=function(){this.pipeline.reset(),this.pipeline.add(e.hy.trimmer,e.hy.stopWordFilter)},e.hy.wordCharacters="[A-Za-z԰-֏ﬀ-ﭏ]",e.hy.trimmer=e.trimmerSupport.generateTrimmer(e.hy.wordCharacters),e.Pipeline.registerFunction(e.hy.trimmer,"trimmer-hy"),e.hy.stopWordFilter=e.generateStopWordFilter("դու և եք էիր էիք հետո նաև նրանք որը վրա է որ պիտի են այս մեջ ն իր ու ի այդ որոնք այն կամ էր մի ես համար այլ իսկ էին ենք հետ ին թ էինք մենք նրա նա դուք եմ էի ըստ որպես ում".split(" ")),e.Pipeline.registerFunction(e.hy.stopWordFilter,"stopWordFilter-hy"),e.hy.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}(),e.Pipeline.registerFunction(e.hy.stemmer,"stemmer-hy")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.it.min.js b/assets/javascripts/lunr/min/lunr.it.min.js
new file mode 100644
index 0000000..344b6a3
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.it.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Italian` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.it=function(){this.pipeline.reset(),this.pipeline.add(e.it.trimmer,e.it.stopWordFilter,e.it.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.it.stemmer))},e.it.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.it.trimmer=e.trimmerSupport.generateTrimmer(e.it.wordCharacters),e.Pipeline.registerFunction(e.it.trimmer,"trimmer-it"),e.it.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(e,r,n){return!(!x.eq_s(1,e)||(x.ket=x.cursor,!x.in_grouping(L,97,249)))&&(x.slice_from(r),x.cursor=n,!0)}function i(){for(var r,n,i,o,t=x.cursor;;){if(x.bra=x.cursor,r=x.find_among(h,7))switch(x.ket=x.cursor,r){case 1:x.slice_from("à");continue;case 2:x.slice_from("è");continue;case 3:x.slice_from("ì");continue;case 4:x.slice_from("ò");continue;case 5:x.slice_from("ù");continue;case 6:x.slice_from("qU");continue;case 7:if(x.cursor>=x.limit)break;x.cursor++;continue}break}for(x.cursor=t;;)for(n=x.cursor;;){if(i=x.cursor,x.in_grouping(L,97,249)){if(x.bra=x.cursor,o=x.cursor,e("u","U",i))break;if(x.cursor=o,e("i","I",i))break}if(x.cursor=i,x.cursor>=x.limit)return void(x.cursor=n);x.cursor++}}function o(e){if(x.cursor=e,!x.in_grouping(L,97,249))return!1;for(;!x.out_grouping(L,97,249);){if(x.cursor>=x.limit)return!1;x.cursor++}return!0}function t(){if(x.in_grouping(L,97,249)){var e=x.cursor;if(x.out_grouping(L,97,249)){for(;!x.in_grouping(L,97,249);){if(x.cursor>=x.limit)return o(e);x.cursor++}return!0}return o(e)}return!1}function s(){var 
e,r=x.cursor;if(!t()){if(x.cursor=r,!x.out_grouping(L,97,249))return;if(e=x.cursor,x.out_grouping(L,97,249)){for(;!x.in_grouping(L,97,249);){if(x.cursor>=x.limit)return x.cursor=e,void(x.in_grouping(L,97,249)&&x.cursor<x.limit&&x.cursor++);x.cursor++}return void(k=x.cursor)}if(x.cursor=e,!x.in_grouping(L,97,249)||x.cursor>=x.limit)return;x.cursor++}k=x.cursor}function a(){for(;!x.in_grouping(L,97,249);){if(x.cursor>=x.limit)return!1;x.cursor++}for(;!x.out_grouping(L,97,249);){if(x.cursor>=x.limit)return!1;x.cursor++}return!0}function u(){var e=x.cursor;k=x.limit,p=k,g=k,s(),x.cursor=e,a()&&(p=x.cursor,a()&&(g=x.cursor))}function c(){for(var e;;){if(x.bra=x.cursor,!(e=x.find_among(q,3)))break;switch(x.ket=x.cursor,e){case 1:x.slice_from("i");break;case 2:x.slice_from("u");break;case 3:if(x.cursor>=x.limit)return;x.cursor++}}}function w(){return k<=x.cursor}function l(){return p<=x.cursor}function m(){return g<=x.cursor}function f(){var e;if(x.ket=x.cursor,x.find_among_b(C,37)&&(x.bra=x.cursor,(e=x.find_among_b(z,5))&&w()))switch(e){case 1:x.slice_del();break;case 2:x.slice_from("e")}}function v(){var e;if(x.ket=x.cursor,!(e=x.find_among_b(S,51)))return!1;switch(x.bra=x.cursor,e){case 1:if(!m())return!1;x.slice_del();break;case 2:if(!m())return!1;x.slice_del(),x.ket=x.cursor,x.eq_s_b(2,"ic")&&(x.bra=x.cursor,m()&&x.slice_del());break;case 3:if(!m())return!1;x.slice_from("log");break;case 4:if(!m())return!1;x.slice_from("u");break;case 5:if(!m())return!1;x.slice_from("ente");break;case 6:if(!w())return!1;x.slice_del();break;case 7:if(!l())return!1;x.slice_del(),x.ket=x.cursor,e=x.find_among_b(P,4),e&&(x.bra=x.cursor,m()&&(x.slice_del(),1==e&&(x.ket=x.cursor,x.eq_s_b(2,"at")&&(x.bra=x.cursor,m()&&x.slice_del()))));break;case 8:if(!m())return!1;x.slice_del(),x.ket=x.cursor,e=x.find_among_b(F,3),e&&(x.bra=x.cursor,1==e&&m()&&x.slice_del());break;case 
9:if(!m())return!1;x.slice_del(),x.ket=x.cursor,x.eq_s_b(2,"at")&&(x.bra=x.cursor,m()&&(x.slice_del(),x.ket=x.cursor,x.eq_s_b(2,"ic")&&(x.bra=x.cursor,m()&&x.slice_del())))}return!0}function b(){var e,r;x.cursor>=k&&(r=x.limit_backward,x.limit_backward=k,x.ket=x.cursor,e=x.find_among_b(W,87),e&&(x.bra=x.cursor,1==e&&x.slice_del()),x.limit_backward=r)}function d(){var e=x.limit-x.cursor;if(x.ket=x.cursor,x.in_grouping_b(y,97,242)&&(x.bra=x.cursor,w()&&(x.slice_del(),x.ket=x.cursor,x.eq_s_b(1,"i")&&(x.bra=x.cursor,w()))))return void x.slice_del();x.cursor=x.limit-e}function _(){d(),x.ket=x.cursor,x.eq_s_b(1,"h")&&(x.bra=x.cursor,x.in_grouping_b(U,99,103)&&w()&&x.slice_del())}var g,p,k,h=[new r("",-1,7),new r("qu",0,6),new r("á",0,1),new r("é",0,2),new r("í",0,3),new r("ó",0,4),new r("ú",0,5)],q=[new r("",-1,3),new r("I",0,1),new r("U",0,2)],C=[new r("la",-1,-1),new r("cela",0,-1),new r("gliela",0,-1),new r("mela",0,-1),new r("tela",0,-1),new r("vela",0,-1),new r("le",-1,-1),new r("cele",6,-1),new r("gliele",6,-1),new r("mele",6,-1),new r("tele",6,-1),new r("vele",6,-1),new r("ne",-1,-1),new r("cene",12,-1),new r("gliene",12,-1),new r("mene",12,-1),new r("sene",12,-1),new r("tene",12,-1),new r("vene",12,-1),new r("ci",-1,-1),new r("li",-1,-1),new r("celi",20,-1),new r("glieli",20,-1),new r("meli",20,-1),new r("teli",20,-1),new r("veli",20,-1),new r("gli",20,-1),new r("mi",-1,-1),new r("si",-1,-1),new r("ti",-1,-1),new r("vi",-1,-1),new r("lo",-1,-1),new r("celo",31,-1),new r("glielo",31,-1),new r("melo",31,-1),new r("telo",31,-1),new r("velo",31,-1)],z=[new r("ando",-1,1),new r("endo",-1,1),new r("ar",-1,2),new r("er",-1,2),new r("ir",-1,2)],P=[new r("ic",-1,-1),new r("abil",-1,-1),new r("os",-1,-1),new r("iv",-1,1)],F=[new r("ic",-1,1),new r("abil",-1,1),new r("iv",-1,1)],S=[new r("ica",-1,1),new r("logia",-1,3),new r("osa",-1,1),new r("ista",-1,1),new r("iva",-1,9),new r("anza",-1,1),new r("enza",-1,5),new r("ice",-1,1),new r("atrice",7,1),new r("iche",-1,1),new 
r("logie",-1,3),new r("abile",-1,1),new r("ibile",-1,1),new r("usione",-1,4),new r("azione",-1,2),new r("uzione",-1,4),new r("atore",-1,2),new r("ose",-1,1),new r("ante",-1,1),new r("mente",-1,1),new r("amente",19,7),new r("iste",-1,1),new r("ive",-1,9),new r("anze",-1,1),new r("enze",-1,5),new r("ici",-1,1),new r("atrici",25,1),new r("ichi",-1,1),new r("abili",-1,1),new r("ibili",-1,1),new r("ismi",-1,1),new r("usioni",-1,4),new r("azioni",-1,2),new r("uzioni",-1,4),new r("atori",-1,2),new r("osi",-1,1),new r("anti",-1,1),new r("amenti",-1,6),new r("imenti",-1,6),new r("isti",-1,1),new r("ivi",-1,9),new r("ico",-1,1),new r("ismo",-1,1),new r("oso",-1,1),new r("amento",-1,6),new r("imento",-1,6),new r("ivo",-1,9),new r("ità",-1,8),new r("istà",-1,1),new r("istè",-1,1),new r("istì",-1,1)],W=[new r("isca",-1,1),new r("enda",-1,1),new r("ata",-1,1),new r("ita",-1,1),new r("uta",-1,1),new r("ava",-1,1),new r("eva",-1,1),new r("iva",-1,1),new r("erebbe",-1,1),new r("irebbe",-1,1),new r("isce",-1,1),new r("ende",-1,1),new r("are",-1,1),new r("ere",-1,1),new r("ire",-1,1),new r("asse",-1,1),new r("ate",-1,1),new r("avate",16,1),new r("evate",16,1),new r("ivate",16,1),new r("ete",-1,1),new r("erete",20,1),new r("irete",20,1),new r("ite",-1,1),new r("ereste",-1,1),new r("ireste",-1,1),new r("ute",-1,1),new r("erai",-1,1),new r("irai",-1,1),new r("isci",-1,1),new r("endi",-1,1),new r("erei",-1,1),new r("irei",-1,1),new r("assi",-1,1),new r("ati",-1,1),new r("iti",-1,1),new r("eresti",-1,1),new r("iresti",-1,1),new r("uti",-1,1),new r("avi",-1,1),new r("evi",-1,1),new r("ivi",-1,1),new r("isco",-1,1),new r("ando",-1,1),new r("endo",-1,1),new r("Yamo",-1,1),new r("iamo",-1,1),new r("avamo",-1,1),new r("evamo",-1,1),new r("ivamo",-1,1),new r("eremo",-1,1),new r("iremo",-1,1),new r("assimo",-1,1),new r("ammo",-1,1),new r("emmo",-1,1),new r("eremmo",54,1),new r("iremmo",54,1),new r("immo",-1,1),new r("ano",-1,1),new r("iscano",58,1),new r("avano",58,1),new r("evano",58,1),new 
r("ivano",58,1),new r("eranno",-1,1),new r("iranno",-1,1),new r("ono",-1,1),new r("iscono",65,1),new r("arono",65,1),new r("erono",65,1),new r("irono",65,1),new r("erebbero",-1,1),new r("irebbero",-1,1),new r("assero",-1,1),new r("essero",-1,1),new r("issero",-1,1),new r("ato",-1,1),new r("ito",-1,1),new r("uto",-1,1),new r("avo",-1,1),new r("evo",-1,1),new r("ivo",-1,1),new r("ar",-1,1),new r("ir",-1,1),new r("erà",-1,1),new r("irà",-1,1),new r("erò",-1,1),new r("irò",-1,1)],L=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,128,128,8,2,1],y=[17,65,0,0,0,0,0,0,0,0,0,0,0,0,0,128,128,8,2],U=[17],x=new n;this.setCurrent=function(e){x.setCurrent(e)},this.getCurrent=function(){return x.getCurrent()},this.stem=function(){var e=x.cursor;return i(),x.cursor=e,u(),x.limit_backward=e,x.cursor=x.limit,f(),x.cursor=x.limit,v()||(x.cursor=x.limit,b()),x.cursor=x.limit,_(),x.cursor=x.limit_backward,c(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.it.stemmer,"stemmer-it"),e.it.stopWordFilter=e.generateStopWordFilter("a abbia abbiamo abbiano abbiate ad agl agli ai al all alla alle allo anche avemmo avendo avesse avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste avresti avrete avrà avrò avuta avute avuti avuto c che chi ci coi col come con contro cui da dagl dagli dai dal dall dalla dalle dallo degl degli dei del dell della delle dello di dov dove e ebbe ebbero ebbi ed era erano eravamo eravate eri ero essendo faccia facciamo facciano facciate faccio facemmo facendo facesse facessero facessi facessimo faceste facesti faceva facevamo facevano facevate facevi facevo fai fanno farai faranno farebbe farebbero farei faremmo faremo fareste faresti farete farà farò fece fecero feci fosse fossero fossi fossimo foste fosti fu fui fummo furono gli ha 
hai hanno ho i il in io l la le lei li lo loro lui ma mi mia mie miei mio ne negl negli nei nel nell nella nelle nello noi non nostra nostre nostri nostro o per perché più quale quanta quante quanti quanto quella quelle quelli quello questa queste questi questo sarai saranno sarebbe sarebbero sarei saremmo saremo sareste saresti sarete sarà sarò se sei si sia siamo siano siate siete sono sta stai stando stanno starai staranno starebbe starebbero starei staremmo staremo stareste staresti starete starà starò stava stavamo stavano stavate stavi stavo stemmo stesse stessero stessi stessimo steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua sue sugl sugli sui sul sull sulla sulle sullo suo suoi ti tra tu tua tue tuo tuoi tutti tutto un una uno vi voi vostra vostre vostri vostro è".split(" ")),e.Pipeline.registerFunction(e.it.stopWordFilter,"stopWordFilter-it")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.ja.min.js b/assets/javascripts/lunr/min/lunr.ja.min.js
new file mode 100644
index 0000000..5f254eb
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.ja.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.ja=function(){this.pipeline.reset(),this.pipeline.add(e.ja.trimmer,e.ja.stopWordFilter,e.ja.stemmer),r?this.tokenizer=e.ja.tokenizer:(e.tokenizer&&(e.tokenizer=e.ja.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.ja.tokenizer))};var t=new e.TinySegmenter;e.ja.tokenizer=function(i){var n,o,s,p,a,u,m,l,c,f;if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t.toLowerCase()):t.toLowerCase()});for(o=i.toString().toLowerCase().replace(/^\s+/,""),n=o.length-1;n>=0;n--)if(/\S/.test(o.charAt(n))){o=o.substring(0,n+1);break}for(a=[],s=o.length,c=0,l=0;c<=s;c++)if(u=o.charAt(c),m=c-l,u.match(/\s/)||c==s){if(m>0)for(p=t.segment(o.slice(l,c)).filter(function(e){return!!e}),f=l,n=0;n<p.length;n++)r?a.push(new e.Token(p[n],{position:[f,p[n].length],index:a.length})):a.push(p[n]),f+=p[n].length;l=c+1}return a},e.ja.stemmer=function(){return function(e){return e}}(),e.Pipeline.registerFunction(e.ja.stemmer,"stemmer-ja"),e.ja.wordCharacters="一二三四五六七八九十百千万億兆一-龠々〆ヵヶぁ-んァ-ヴーｱ-ﾝﾞa-zA-Zａ-ｚＡ-Ｚ0-9０-９",e.ja.trimmer=e.trimmerSupport.generateTrimmer(e.ja.wordCharacters),e.Pipeline.registerFunction(e.ja.trimmer,"trimmer-ja"),e.ja.stopWordFilter=e.generateStopWordFilter("これ それ あれ この その あの ここ そこ あそこ こちら どこ だれ なに なん 何 私 貴方 貴方方 我々 私達 あの人 あのかた 彼女 彼 です あります おります います は が の に を で え から まで より も どの と し それで しかし".split(" 
")),e.Pipeline.registerFunction(e.ja.stopWordFilter,"stopWordFilter-ja"),e.jp=e.ja,e.Pipeline.registerFunction(e.jp.stemmer,"stemmer-jp"),e.Pipeline.registerFunction(e.jp.trimmer,"trimmer-jp"),e.Pipeline.registerFunction(e.jp.stopWordFilter,"stopWordFilter-jp")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.jp.min.js b/assets/javascripts/lunr/min/lunr.jp.min.js
new file mode 100644
index 0000000..c055eba
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.jp.min.js
@@ -0,0 +1 @@
+module.exports=require("./lunr.ja");
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.kn.min.js b/assets/javascripts/lunr/min/lunr.kn.min.js
new file mode 100644
index 0000000..1cef9be
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.kn.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.kn=function(){this.pipeline.reset(),this.pipeline.add(e.kn.trimmer,e.kn.stopWordFilter,e.kn.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.kn.stemmer))},e.kn.wordCharacters="ಀ-಄ಅ-ಔಕ-ಹಾ-ೌ಼-ಽೕ-ೖೝ-ೞೠ-ೡೢ-ೣ೤೥೦-೯ೱ-ೳ",e.kn.trimmer=e.trimmerSupport.generateTrimmer(e.kn.wordCharacters),e.Pipeline.registerFunction(e.kn.trimmer,"trimmer-kn"),e.kn.stopWordFilter=e.generateStopWordFilter("ಮತ್ತು ಈ ಒಂದು ರಲ್ಲಿ ಹಾಗೂ ಎಂದು ಅಥವಾ ಇದು ರ ಅವರು ಎಂಬ ಮೇಲೆ ಅವರ ತನ್ನ ಆದರೆ ತಮ್ಮ ನಂತರ ಮೂಲಕ ಹೆಚ್ಚು ನ ಆ ಕೆಲವು ಅನೇಕ ಎರಡು ಹಾಗು ಪ್ರಮುಖ ಇದನ್ನು ಇದರ ಸುಮಾರು ಅದರ ಅದು ಮೊದಲ ಬಗ್ಗೆ ನಲ್ಲಿ ರಂದು ಇತರ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ಸಹ ಸಾಮಾನ್ಯವಾಗಿ ನೇ ಹಲವಾರು ಹೊಸ ದಿ ಕಡಿಮೆ ಯಾವುದೇ ಹೊಂದಿದೆ ದೊಡ್ಡ ಅನ್ನು ಇವರು ಪ್ರಕಾರ ಇದೆ ಮಾತ್ರ ಕೂಡ ಇಲ್ಲಿ ಎಲ್ಲಾ ವಿವಿಧ ಅದನ್ನು ಹಲವು ರಿಂದ ಕೇವಲ ದ ದಕ್ಷಿಣ ಗೆ ಅವನ ಅತಿ ನೆಯ ಬಹಳ ಕೆಲಸ ಎಲ್ಲ ಪ್ರತಿ ಇತ್ಯಾದಿ ಇವು ಬೇರೆ ಹೀಗೆ ನಡುವೆ ಇದಕ್ಕೆ ಎಸ್ ಇವರ ಮೊದಲು ಶ್ರೀ ಮಾಡುವ ಇದರಲ್ಲಿ ರೀತಿಯ ಮಾಡಿದ ಕಾಲ ಅಲ್ಲಿ ಮಾಡಲು ಅದೇ ಈಗ ಅವು ಗಳು ಎ ಎಂಬುದು ಅವನು ಅಂದರೆ ಅವರಿಗೆ ಇರುವ ವಿಶೇಷ ಮುಂದೆ ಅವುಗಳ ಮುಂತಾದ ಮೂಲ ಬಿ ಮೀ ಒಂದೇ ಇನ್ನೂ ಹೆಚ್ಚಾಗಿ ಮಾಡಿ ಅವರನ್ನು ಇದೇ ಯ ರೀತಿಯಲ್ಲಿ ಜೊತೆ ಅದರಲ್ಲಿ ಮಾಡಿದರು ನಡೆದ ಆಗ ಮತ್ತೆ ಪೂರ್ವ ಆತ ಬಂದ ಯಾವ ಒಟ್ಟು ಇತರೆ ಹಿಂದೆ ಪ್ರಮಾಣದ ಗಳನ್ನು ಕುರಿತು ಯು ಆದ್ದರಿಂದ ಅಲ್ಲದೆ ನಗರದ ಮೇಲಿನ ಏಕೆಂದರೆ ರಷ್ಟು ಎಂಬುದನ್ನು ಬಾರಿ ಎಂದರೆ ಹಿಂದಿನ ಆದರೂ ಆದ ಸಂಬಂಧಿಸಿದ ಮತ್ತೊಂದು ಸಿ ಆತನ ".split(" ")),e.kn.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var r=e.wordcut;r.init(),e.kn.tokenizer=function(t){if(!arguments.length||null==t||void 0==t)return[];if(Array.isArray(t))return t.map(function(r){return isLunr2?new e.Token(r.toLowerCase()):r.toLowerCase()});var 
n=t.toString().toLowerCase().replace(/^\s+/,"");return r.cut(n).split("|")},e.Pipeline.registerFunction(e.kn.stemmer,"stemmer-kn"),e.Pipeline.registerFunction(e.kn.stopWordFilter,"stopWordFilter-kn")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.ko.min.js b/assets/javascripts/lunr/min/lunr.ko.min.js
new file mode 100644
index 0000000..eaf9dab
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.ko.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ko=function(){this.pipeline.reset(),this.pipeline.add(e.ko.trimmer,e.ko.stopWordFilter)},e.ko.wordCharacters="[A-Za-z가-힣]",e.ko.trimmer=e.trimmerSupport.generateTrimmer(e.ko.wordCharacters),e.Pipeline.registerFunction(e.ko.trimmer,"trimmer-ko"),e.ko.stopWordFilter=e.generateStopWordFilter("아 휴 아이구 아이쿠 아이고 어 나 우리 저희 따라 의해 을 를 에 의 가 으로 로 에게 뿐이다 의거하여 근거하여 입각하여 기준으로 예하면 예를 들면 예를 들자면 저 소인 소생 저희 지말고 하지마 하지마라 다른 물론 또한 그리고 비길수 없다 해서는 안된다 뿐만 아니라 만이 아니다 만은 아니다 막론하고 관계없이 그치지 않다 그러나 그런데 하지만 든간에 논하지 않다 따지지 않다 설사 비록 더라도 아니면 만 못하다 하는 편이 낫다 불문하고 향하여 향해서 향하다 쪽으로 틈타 이용하여 타다 오르다 제외하고 이 외에 이 밖에 하여야 비로소 한다면 몰라도 외에도 이곳 여기 부터 기점으로 따라서 할 생각이다 하려고하다 이리하여 그리하여 그렇게 함으로써 하지만 일때 할때 앞에서 중에서 보는데서 으로써 로써 까지 해야한다 일것이다 반드시 할줄알다 할수있다 할수있어 임에 틀림없다 한다면 등 등등 제 겨우 단지 다만 할뿐 딩동 댕그 대해서 대하여 대하면 훨씬 얼마나 얼마만큼 얼마큼 남짓 여 얼마간 약간 다소 좀 조금 다수 몇 얼마 지만 하물며 또한 그러나 그렇지만 하지만 이외에도 대해 말하자면 뿐이다 다음에 반대로 반대로 말하자면 이와 반대로 바꾸어서 말하면 바꾸어서 한다면 만약 그렇지않으면 까악 툭 딱 삐걱거리다 보드득 비걱거리다 꽈당 응당 해야한다 에 가서 각 각각 여러분 각종 각자 제각기 하도록하다 와 과 그러므로 그래서 고로 한 까닭에 하기 때문에 거니와 이지만 대하여 관하여 관한 과연 실로 아니나다를가 생각한대로 진짜로 한적이있다 하곤하였다 하 하하 허허 아하 거바 와 오 왜 어째서 무엇때문에 어찌 하겠는가 무슨 어디 어느곳 더군다나 하물며 더욱이는 어느때 언제 야 이봐 어이 여보시오 흐흐 흥 휴 헉헉 헐떡헐떡 영차 여차 어기여차 끙끙 아야 앗 아야 콸콸 졸졸 좍좍 뚝뚝 주룩주룩 솨 우르르 그래도 또 그리고 바꾸어말하면 바꾸어말하자면 혹은 혹시 답다 및 그에 따르는 때가 되어 즉 지든지 설령 가령 하더라도 할지라도 일지라도 지든지 몇 거의 하마터면 인젠 이젠 된바에야 된이상 만큼\t어찌됏든 그위에 게다가 점에서 보아 비추어 보아 고려하면 하게될것이다 일것이다 비교적 좀 보다더 비하면 시키다 하게하다 할만하다 의해서 연이서 이어서 잇따라 뒤따라 뒤이어 결국 의지하여 기대여 통하여 자마자 더욱더 불구하고 얼마든지 마음대로 주저하지 않고 곧 즉시 바로 당장 하자마자 밖에 안된다 하면된다 그래 그렇지 요컨대 다시 말하자면 바꿔 말하면 즉 구체적으로 말하자면 시작하여 시초에 이상 허 헉 허걱 바와같이 해도좋다 해도된다 게다가 더구나 하물며 와르르 팍 퍽 펄렁 동안 이래 하고있었다 이었다 에서 로부터 
까지 예하면 했어요 해요 함께 같이 더불어 마저 마저도 양자 모두 습니다 가까스로 하려고하다 즈음하여 다른 다른 방면으로 해봐요 습니까 했어요 말할것도 없고 무릎쓰고 개의치않고 하는것만 못하다 하는것이 낫다 매 매번 들 모 어느것 어느 로써 갖고말하자면 어디 어느쪽 어느것 어느해 어느 년도 라 해도 언젠가 어떤것 어느것 저기 저쪽 저것 그때 그럼 그러면 요만한걸 그래 그때 저것만큼 그저 이르기까지 할 줄 안다 할 힘이 있다 너 너희 당신 어찌 설마 차라리 할지언정 할지라도 할망정 할지언정 구토하다 게우다 토하다 메쓰겁다 옆사람 퉤 쳇 의거하여 근거하여 의해 따라 힘입어 그 다음 버금 두번째로 기타 첫번째로 나머지는 그중에서 견지에서 형식으로 쓰여 입장에서 위해서 단지 의해되다 하도록시키다 뿐만아니라 반대로 전후 전자 앞의것 잠시 잠깐 하면서 그렇지만 다음에 그러한즉 그런즉 남들 아무거나 어찌하든지 같다 비슷하다 예컨대 이럴정도로 어떻게 만약 만일 위에서 서술한바와같이 인 듯하다 하지 않는다면 만약에 무엇 무슨 어느 어떤 아래윗 조차 한데 그럼에도 불구하고 여전히 심지어 까지도 조차도 하지 않도록 않기 위하여 때 시각 무렵 시간 동안 어때 어떠한 하여금 네 예 우선 누구 누가 알겠는가 아무도 줄은모른다 줄은 몰랏다 하는 김에 겸사겸사 하는바 그런 까닭에 한 이유는 그러니 그러니까 때문에 그 너희 그들 너희들 타인 것 것들 너 위하여 공동으로 동시에 하기 위하여 어찌하여 무엇때문에 붕붕 윙윙 나 우리 엉엉 휘익 윙윙 오호 아하 어쨋든 만 못하다\t하기보다는 차라리 하는 편이 낫다 흐흐 놀라다 상대적으로 말하자면 마치 아니라면 쉿 그렇지 않으면 그렇지 않다면 안 그러면 아니었다면 하든지 아니면 이라면 좋아 알았어 하는것도 그만이다 어쩔수 없다 하나 일 일반적으로 일단 한켠으로는 오자마자 이렇게되면 이와같다면 전부 한마디 한항목 근거로 하기에 아울러 하지 않도록 않기 위해서 이르기까지 이 되다 로 인하여 까닭으로 이유만으로 이로 인하여 그래서 이 때문에 그러므로 그런 까닭에 알 수 있다 결론을 낼 수 있다 으로 인하여 있다 어떤것 관계가 있다 관련이 있다 연관되다 어떤것들 에 대해 이리하여 그리하여 여부 하기보다는 하느니 하면 할수록 운운 이러이러하다 하구나 하도다 다시말하면 다음으로 에 있다 에 달려 있다 우리 우리들 오히려 하기는한데 어떻게 어떻해 어찌됏어 어때 어째서 본대로 자 이 이쪽 여기 이것 이번 이렇게말하자면 이런 이러한 이와 같은 요만큼 요만한 것 얼마 안 되는 것 이만큼 이 정도의 이렇게 많은 것 이와 같다 이때 이렇구나 것과 같이 끼익 삐걱 따위 와 같은 사람들 부류의 사람들 왜냐하면 중의하나 오직 오로지 에 한하다 하기만 하면 도착하다 까지 미치다 도달하다 정도에 이르다 할 지경이다 결과에 이르다 관해서는 여러분 하고 있다 한 후 혼자 자기 자기집 자신 우에 종합한것과같이 총적으로 보면 총적으로 말하면 총적으로 대로 하다 으로서 참 그만이다 할 따름이다 쿵 탕탕 쾅쾅 둥둥 봐 봐라 아이야 아니 와아 응 아이 참나 년 월 일 령 영 일 이 삼 사 오 육 륙 칠 팔 구 이천육 이천칠 이천팔 이천구 하나 둘 셋 넷 다섯 여섯 일곱 여덟 아홉 령 영".split(" ")),e.Pipeline.registerFunction(e.ko.stopWordFilter,"stopWordFilter-ko"),e.ko.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}(),e.Pipeline.registerFunction(e.ko.stemmer,"stemmer-ko")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.multi.min.js b/assets/javascripts/lunr/min/lunr.multi.min.js
new file mode 100644
index 0000000..7debad0
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.multi.min.js
@@ -0,0 +1 @@
+!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){e.multiLanguage=function(){for(var t=Array.prototype.slice.call(arguments),i=t.join("-"),r="",n=[],s=[],p=0;p<t.length;++p)"en"==t[p]?(r+="\\w",n.unshift(e.stopWordFilter),n.push(e.stemmer),s.push(e.stemmer)):(r+=e[t[p]].wordCharacters,e[t[p]].stopWordFilter&&n.unshift(e[t[p]].stopWordFilter),e[t[p]].stemmer&&(n.push(e[t[p]].stemmer),s.push(e[t[p]].stemmer)));var o=e.trimmerSupport.generateTrimmer(r);return e.Pipeline.registerFunction(o,"lunr-multi-trimmer-"+i),n.unshift(o),function(){this.pipeline.reset(),this.pipeline.add.apply(this.pipeline,n),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add.apply(this.searchPipeline,s))}}}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.nl.min.js b/assets/javascripts/lunr/min/lunr.nl.min.js
new file mode 100644
index 0000000..c4a2535
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.nl.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Dutch` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(r,e){"function"==typeof define&&define.amd?define(e):"object"==typeof exports?module.exports=e():e()(r.lunr)}(this,function(){return function(r){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");r.nl=function(){this.pipeline.reset(),this.pipeline.add(r.nl.trimmer,r.nl.stopWordFilter,r.nl.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(r.nl.stemmer))},r.nl.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",r.nl.trimmer=r.trimmerSupport.generateTrimmer(r.nl.wordCharacters),r.Pipeline.registerFunction(r.nl.trimmer,"trimmer-nl"),r.nl.stemmer=function(){var e=r.stemmerSupport.Among,i=r.stemmerSupport.SnowballProgram,n=new function(){function r(){for(var r,e,i,o=C.cursor;;){if(C.bra=C.cursor,r=C.find_among(b,11))switch(C.ket=C.cursor,r){case 1:C.slice_from("a");continue;case 2:C.slice_from("e");continue;case 3:C.slice_from("i");continue;case 4:C.slice_from("o");continue;case 5:C.slice_from("u");continue;case 6:if(C.cursor>=C.limit)break;C.cursor++;continue}break}for(C.cursor=o,C.bra=o,C.eq_s(1,"y")?(C.ket=C.cursor,C.slice_from("Y")):C.cursor=o;;)if(e=C.cursor,C.in_grouping(q,97,232)){if(i=C.cursor,C.bra=i,C.eq_s(1,"i"))C.ket=C.cursor,C.in_grouping(q,97,232)&&(C.slice_from("I"),C.cursor=e);else if(C.cursor=i,C.eq_s(1,"y"))C.ket=C.cursor,C.slice_from("Y"),C.cursor=e;else if(n(e))break}else if(n(e))break}function n(r){return C.cursor=r,r>=C.limit||(C.cursor++,!1)}function o(){_=C.limit,d=_,t()||(_=C.cursor,_<3&&(_=3),t()||(d=C.cursor))}function t(){for(;!C.in_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}for(;!C.out_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}return!1}function s(){for(var 
r;;)if(C.bra=C.cursor,r=C.find_among(p,3))switch(C.ket=C.cursor,r){case 1:C.slice_from("y");break;case 2:C.slice_from("i");break;case 3:if(C.cursor>=C.limit)return;C.cursor++}}function u(){return _<=C.cursor}function c(){return d<=C.cursor}function a(){var r=C.limit-C.cursor;C.find_among_b(g,3)&&(C.cursor=C.limit-r,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del()))}function l(){var r;w=!1,C.ket=C.cursor,C.eq_s_b(1,"e")&&(C.bra=C.cursor,u()&&(r=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-r,C.slice_del(),w=!0,a())))}function m(){var r;u()&&(r=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-r,C.eq_s_b(3,"gem")||(C.cursor=C.limit-r,C.slice_del(),a())))}function f(){var r,e,i,n,o,t,s=C.limit-C.cursor;if(C.ket=C.cursor,r=C.find_among_b(h,5))switch(C.bra=C.cursor,r){case 1:u()&&C.slice_from("heid");break;case 2:m();break;case 3:u()&&C.out_grouping_b(j,97,232)&&C.slice_del()}if(C.cursor=C.limit-s,l(),C.cursor=C.limit-s,C.ket=C.cursor,C.eq_s_b(4,"heid")&&(C.bra=C.cursor,c()&&(e=C.limit-C.cursor,C.eq_s_b(1,"c")||(C.cursor=C.limit-e,C.slice_del(),C.ket=C.cursor,C.eq_s_b(2,"en")&&(C.bra=C.cursor,m())))),C.cursor=C.limit-s,C.ket=C.cursor,r=C.find_among_b(k,6))switch(C.bra=C.cursor,r){case 1:if(c()){if(C.slice_del(),i=C.limit-C.cursor,C.ket=C.cursor,C.eq_s_b(2,"ig")&&(C.bra=C.cursor,c()&&(n=C.limit-C.cursor,!C.eq_s_b(1,"e")))){C.cursor=C.limit-n,C.slice_del();break}C.cursor=C.limit-i,a()}break;case 2:c()&&(o=C.limit-C.cursor,C.eq_s_b(1,"e")||(C.cursor=C.limit-o,C.slice_del()));break;case 3:c()&&(C.slice_del(),l());break;case 4:c()&&C.slice_del();break;case 5:c()&&w&&C.slice_del()}C.cursor=C.limit-s,C.out_grouping_b(z,73,232)&&(t=C.limit-C.cursor,C.find_among_b(v,4)&&C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-t,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del())))}var d,_,w,b=[new e("",-1,6),new e("á",0,1),new e("ä",0,1),new e("é",0,2),new e("ë",0,2),new e("í",0,3),new 
e("ï",0,3),new e("ó",0,4),new e("ö",0,4),new e("ú",0,5),new e("ü",0,5)],p=[new e("",-1,3),new e("I",0,2),new e("Y",0,1)],g=[new e("dd",-1,-1),new e("kk",-1,-1),new e("tt",-1,-1)],h=[new e("ene",-1,2),new e("se",-1,3),new e("en",-1,2),new e("heden",2,1),new e("s",-1,3)],k=[new e("end",-1,1),new e("ig",-1,2),new e("ing",-1,1),new e("lijk",-1,3),new e("baar",-1,4),new e("bar",-1,5)],v=[new e("aa",-1,-1),new e("ee",-1,-1),new e("oo",-1,-1),new e("uu",-1,-1)],q=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],z=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],j=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],C=new i;this.setCurrent=function(r){C.setCurrent(r)},this.getCurrent=function(){return C.getCurrent()},this.stem=function(){var e=C.cursor;return r(),C.cursor=e,o(),C.limit_backward=e,C.cursor=C.limit,f(),C.cursor=C.limit_backward,s(),!0}};return function(r){return"function"==typeof r.update?r.update(function(r){return n.setCurrent(r),n.stem(),n.getCurrent()}):(n.setCurrent(r),n.stem(),n.getCurrent())}}(),r.Pipeline.registerFunction(r.nl.stemmer,"stemmer-nl"),r.nl.stopWordFilter=r.generateStopWordFilter(" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou".split(" ")),r.Pipeline.registerFunction(r.nl.stopWordFilter,"stopWordFilter-nl")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.no.min.js b/assets/javascripts/lunr/min/lunr.no.min.js
new file mode 100644
index 0000000..92bc7e4
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.no.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Norwegian` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.no=function(){this.pipeline.reset(),this.pipeline.add(e.no.trimmer,e.no.stopWordFilter,e.no.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.no.stemmer))},e.no.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.no.trimmer=e.trimmerSupport.generateTrimmer(e.no.wordCharacters),e.Pipeline.registerFunction(e.no.trimmer,"trimmer-no"),e.no.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(){var e,r=w.cursor+3;if(a=w.limit,0<=r||r<=w.limit){for(s=r;;){if(e=w.cursor,w.in_grouping(d,97,248)){w.cursor=e;break}if(e>=w.limit)return;w.cursor=e+1}for(;!w.out_grouping(d,97,248);){if(w.cursor>=w.limit)return;w.cursor++}a=w.cursor,a<s&&(a=s)}}function i(){var e,r,n;if(w.cursor>=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(m,29),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:n=w.limit-w.cursor,w.in_grouping_b(c,98,122)?w.slice_del():(w.cursor=w.limit-n,w.eq_s_b(1,"k")&&w.out_grouping_b(d,97,248)&&w.slice_del());break;case 3:w.slice_from("er")}}function t(){var e,r=w.limit-w.cursor;w.cursor>=a&&(e=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,w.find_among_b(u,2)?(w.bra=w.cursor,w.limit_backward=e,w.cursor=w.limit-r,w.cursor>w.limit_backward&&(w.cursor--,w.bra=w.cursor,w.slice_del())):w.limit_backward=e)}function o(){var 
e,r;w.cursor>=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(l,11),e?(w.bra=w.cursor,w.limit_backward=r,1==e&&w.slice_del()):w.limit_backward=r)}var s,a,m=[new r("a",-1,1),new r("e",-1,1),new r("ede",1,1),new r("ande",1,1),new r("ende",1,1),new r("ane",1,1),new r("ene",1,1),new r("hetene",6,1),new r("erte",1,3),new r("en",-1,1),new r("heten",9,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",12,1),new r("s",-1,2),new r("as",14,1),new r("es",14,1),new r("edes",16,1),new r("endes",16,1),new r("enes",16,1),new r("hetenes",19,1),new r("ens",14,1),new r("hetens",21,1),new r("ers",14,1),new r("ets",14,1),new r("et",-1,1),new r("het",25,1),new r("ert",-1,3),new r("ast",-1,1)],u=[new r("dt",-1,-1),new r("vt",-1,-1)],l=[new r("leg",-1,1),new r("eleg",0,1),new r("ig",-1,1),new r("eig",2,1),new r("lig",2,1),new r("elig",4,1),new r("els",-1,1),new r("lov",-1,1),new r("elov",7,1),new r("slov",7,1),new r("hetslov",9,1)],d=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],c=[119,125,149,1],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,i(),w.cursor=w.limit,t(),w.cursor=w.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.no.stemmer,"stemmer-no"),e.no.stopWordFilter=e.generateStopWordFilter("alle at av bare begge ble blei bli blir blitt både båe da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då eg ein eit eitt eller elles en enn er et ett etter for fordi fra før ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor i ikke ikkje ikkje ingen ingi inkje inn inni ja jeg kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi 
kvifor man mange me med medan meg meget mellom men mi min mine mitt mot mykje ned no noe noen noka noko nokon nokor nokre nå når og også om opp oss over på samme seg selv si si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn til um upp ut uten var vart varte ved vere verte vi vil ville vore vors vort vår være være vært å".split(" ")),e.Pipeline.registerFunction(e.no.stopWordFilter,"stopWordFilter-no")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.pt.min.js b/assets/javascripts/lunr/min/lunr.pt.min.js
new file mode 100644
index 0000000..6c16996
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.pt.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Portuguese` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.pt=function(){this.pipeline.reset(),this.pipeline.add(e.pt.trimmer,e.pt.stopWordFilter,e.pt.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.pt.stemmer))},e.pt.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.pt.trimmer=e.trimmerSupport.generateTrimmer(e.pt.wordCharacters),e.Pipeline.registerFunction(e.pt.trimmer,"trimmer-pt"),e.pt.stemmer=function(){var r=e.stemmerSupport.Among,s=e.stemmerSupport.SnowballProgram,n=new function(){function e(){for(var e;;){if(z.bra=z.cursor,e=z.find_among(k,3))switch(z.ket=z.cursor,e){case 1:z.slice_from("a~");continue;case 2:z.slice_from("o~");continue;case 3:if(z.cursor>=z.limit)break;z.cursor++;continue}break}}function n(){if(z.out_grouping(y,97,250)){for(;!z.in_grouping(y,97,250);){if(z.cursor>=z.limit)return!0;z.cursor++}return!1}return!0}function i(){if(z.in_grouping(y,97,250))for(;!z.out_grouping(y,97,250);){if(z.cursor>=z.limit)return!1;z.cursor++}return g=z.cursor,!0}function o(){var e,r,s=z.cursor;if(z.in_grouping(y,97,250))if(e=z.cursor,n()){if(z.cursor=e,i())return}else g=z.cursor;if(z.cursor=s,z.out_grouping(y,97,250)){if(r=z.cursor,n()){if(z.cursor=r,!z.in_grouping(y,97,250)||z.cursor>=z.limit)return;z.cursor++}g=z.cursor}}function t(){for(;!z.in_grouping(y,97,250);){if(z.cursor>=z.limit)return!1;z.cursor++}for(;!z.out_grouping(y,97,250);){if(z.cursor>=z.limit)return!1;z.cursor++}return!0}function a(){var e=z.cursor;g=z.limit,b=g,h=g,o(),z.cursor=e,t()&&(b=z.cursor,t()&&(h=z.cursor))}function u(){for(var 
e;;){if(z.bra=z.cursor,e=z.find_among(q,3))switch(z.ket=z.cursor,e){case 1:z.slice_from("ã");continue;case 2:z.slice_from("õ");continue;case 3:if(z.cursor>=z.limit)break;z.cursor++;continue}break}}function w(){return g<=z.cursor}function m(){return b<=z.cursor}function c(){return h<=z.cursor}function l(){var e;if(z.ket=z.cursor,!(e=z.find_among_b(F,45)))return!1;switch(z.bra=z.cursor,e){case 1:if(!c())return!1;z.slice_del();break;case 2:if(!c())return!1;z.slice_from("log");break;case 3:if(!c())return!1;z.slice_from("u");break;case 4:if(!c())return!1;z.slice_from("ente");break;case 5:if(!m())return!1;z.slice_del(),z.ket=z.cursor,e=z.find_among_b(j,4),e&&(z.bra=z.cursor,c()&&(z.slice_del(),1==e&&(z.ket=z.cursor,z.eq_s_b(2,"at")&&(z.bra=z.cursor,c()&&z.slice_del()))));break;case 6:if(!c())return!1;z.slice_del(),z.ket=z.cursor,e=z.find_among_b(C,3),e&&(z.bra=z.cursor,1==e&&c()&&z.slice_del());break;case 7:if(!c())return!1;z.slice_del(),z.ket=z.cursor,e=z.find_among_b(P,3),e&&(z.bra=z.cursor,1==e&&c()&&z.slice_del());break;case 8:if(!c())return!1;z.slice_del(),z.ket=z.cursor,z.eq_s_b(2,"at")&&(z.bra=z.cursor,c()&&z.slice_del());break;case 9:if(!w()||!z.eq_s_b(1,"e"))return!1;z.slice_from("ir")}return!0}function f(){var e,r;if(z.cursor>=g){if(r=z.limit_backward,z.limit_backward=g,z.ket=z.cursor,e=z.find_among_b(S,120))return z.bra=z.cursor,1==e&&z.slice_del(),z.limit_backward=r,!0;z.limit_backward=r}return!1}function d(){var e;z.ket=z.cursor,(e=z.find_among_b(W,7))&&(z.bra=z.cursor,1==e&&w()&&z.slice_del())}function v(e,r){if(z.eq_s_b(1,e)){z.bra=z.cursor;var s=z.limit-z.cursor;if(z.eq_s_b(1,r))return z.cursor=z.limit-s,w()&&z.slice_del(),!1}return!0}function p(){var e;if(z.ket=z.cursor,e=z.find_among_b(L,4))switch(z.bra=z.cursor,e){case 1:w()&&(z.slice_del(),z.ket=z.cursor,z.limit-z.cursor,v("u","g")&&v("i","c"));break;case 2:z.slice_from("c")}}function _(){if(!l()&&(z.cursor=z.limit,!f()))return z.cursor=z.limit,void 
d();z.cursor=z.limit,z.ket=z.cursor,z.eq_s_b(1,"i")&&(z.bra=z.cursor,z.eq_s_b(1,"c")&&(z.cursor=z.limit,w()&&z.slice_del()))}var h,b,g,k=[new r("",-1,3),new r("ã",0,1),new r("õ",0,2)],q=[new r("",-1,3),new r("a~",0,1),new r("o~",0,2)],j=[new r("ic",-1,-1),new r("ad",-1,-1),new r("os",-1,-1),new r("iv",-1,1)],C=[new r("ante",-1,1),new r("avel",-1,1),new r("ível",-1,1)],P=[new r("ic",-1,1),new r("abil",-1,1),new r("iv",-1,1)],F=[new r("ica",-1,1),new r("ância",-1,1),new r("ência",-1,4),new r("ira",-1,9),new r("adora",-1,1),new r("osa",-1,1),new r("ista",-1,1),new r("iva",-1,8),new r("eza",-1,1),new r("logía",-1,2),new r("idade",-1,7),new r("ante",-1,1),new r("mente",-1,6),new r("amente",12,5),new r("ável",-1,1),new r("ível",-1,1),new r("ución",-1,3),new r("ico",-1,1),new r("ismo",-1,1),new r("oso",-1,1),new r("amento",-1,1),new r("imento",-1,1),new r("ivo",-1,8),new r("aça~o",-1,1),new r("ador",-1,1),new r("icas",-1,1),new r("ências",-1,4),new r("iras",-1,9),new r("adoras",-1,1),new r("osas",-1,1),new r("istas",-1,1),new r("ivas",-1,8),new r("ezas",-1,1),new r("logías",-1,2),new r("idades",-1,7),new r("uciones",-1,3),new r("adores",-1,1),new r("antes",-1,1),new r("aço~es",-1,1),new r("icos",-1,1),new r("ismos",-1,1),new r("osos",-1,1),new r("amentos",-1,1),new r("imentos",-1,1),new r("ivos",-1,8)],S=[new r("ada",-1,1),new r("ida",-1,1),new r("ia",-1,1),new r("aria",2,1),new r("eria",2,1),new r("iria",2,1),new r("ara",-1,1),new r("era",-1,1),new r("ira",-1,1),new r("ava",-1,1),new r("asse",-1,1),new r("esse",-1,1),new r("isse",-1,1),new r("aste",-1,1),new r("este",-1,1),new r("iste",-1,1),new r("ei",-1,1),new r("arei",16,1),new r("erei",16,1),new r("irei",16,1),new r("am",-1,1),new r("iam",20,1),new r("ariam",21,1),new r("eriam",21,1),new r("iriam",21,1),new r("aram",20,1),new r("eram",20,1),new r("iram",20,1),new r("avam",20,1),new r("em",-1,1),new r("arem",29,1),new r("erem",29,1),new r("irem",29,1),new r("assem",29,1),new r("essem",29,1),new r("issem",29,1),new 
r("ado",-1,1),new r("ido",-1,1),new r("ando",-1,1),new r("endo",-1,1),new r("indo",-1,1),new r("ara~o",-1,1),new r("era~o",-1,1),new r("ira~o",-1,1),new r("ar",-1,1),new r("er",-1,1),new r("ir",-1,1),new r("as",-1,1),new r("adas",47,1),new r("idas",47,1),new r("ias",47,1),new r("arias",50,1),new r("erias",50,1),new r("irias",50,1),new r("aras",47,1),new r("eras",47,1),new r("iras",47,1),new r("avas",47,1),new r("es",-1,1),new r("ardes",58,1),new r("erdes",58,1),new r("irdes",58,1),new r("ares",58,1),new r("eres",58,1),new r("ires",58,1),new r("asses",58,1),new r("esses",58,1),new r("isses",58,1),new r("astes",58,1),new r("estes",58,1),new r("istes",58,1),new r("is",-1,1),new r("ais",71,1),new r("eis",71,1),new r("areis",73,1),new r("ereis",73,1),new r("ireis",73,1),new r("áreis",73,1),new r("éreis",73,1),new r("íreis",73,1),new r("ásseis",73,1),new r("ésseis",73,1),new r("ísseis",73,1),new r("áveis",73,1),new r("íeis",73,1),new r("aríeis",84,1),new r("eríeis",84,1),new r("iríeis",84,1),new r("ados",-1,1),new r("idos",-1,1),new r("amos",-1,1),new r("áramos",90,1),new r("éramos",90,1),new r("íramos",90,1),new r("ávamos",90,1),new r("íamos",90,1),new r("aríamos",95,1),new r("eríamos",95,1),new r("iríamos",95,1),new r("emos",-1,1),new r("aremos",99,1),new r("eremos",99,1),new r("iremos",99,1),new r("ássemos",99,1),new r("êssemos",99,1),new r("íssemos",99,1),new r("imos",-1,1),new r("armos",-1,1),new r("ermos",-1,1),new r("irmos",-1,1),new r("ámos",-1,1),new r("arás",-1,1),new r("erás",-1,1),new r("irás",-1,1),new r("eu",-1,1),new r("iu",-1,1),new r("ou",-1,1),new r("ará",-1,1),new r("erá",-1,1),new r("irá",-1,1)],W=[new r("a",-1,1),new r("i",-1,1),new r("o",-1,1),new r("os",-1,1),new r("á",-1,1),new r("í",-1,1),new r("ó",-1,1)],L=[new r("e",-1,1),new r("ç",-1,2),new r("é",-1,1),new r("ê",-1,1)],y=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,3,19,12,2],z=new s;this.setCurrent=function(e){z.setCurrent(e)},this.getCurrent=function(){return z.getCurrent()},this.stem=function(){var 
r=z.cursor;return e(),z.cursor=r,a(),z.limit_backward=r,z.cursor=z.limit,_(),z.cursor=z.limit,p(),z.cursor=z.limit_backward,u(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.pt.stemmer,"stemmer-pt"),e.pt.stopWordFilter=e.generateStopWordFilter("a ao aos aquela aquelas aquele aqueles aquilo as até com como da das de dela delas dele deles depois do dos e ela elas ele eles em entre era eram essa essas esse esses esta estamos estas estava estavam este esteja estejam estejamos estes esteve estive estivemos estiver estivera estiveram estiverem estivermos estivesse estivessem estivéramos estivéssemos estou está estávamos estão eu foi fomos for fora foram forem formos fosse fossem fui fôramos fôssemos haja hajam hajamos havemos hei houve houvemos houver houvera houveram houverei houverem houveremos houveria houveriam houvermos houverá houverão houveríamos houvesse houvessem houvéramos houvéssemos há hão isso isto já lhe lhes mais mas me mesmo meu meus minha minhas muito na nas nem no nos nossa nossas nosso nossos num numa não nós o os ou para pela pelas pelo pelos por qual quando que quem se seja sejam sejamos sem serei seremos seria seriam será serão seríamos seu seus somos sou sua suas são só também te tem temos tenha tenham tenhamos tenho terei teremos teria teriam terá terão teríamos teu teus teve tinha tinham tive tivemos tiver tivera tiveram tiverem tivermos tivesse tivessem tivéramos tivéssemos tu tua tuas tém tínhamos um uma você vocês vos à às éramos".split(" ")),e.Pipeline.registerFunction(e.pt.stopWordFilter,"stopWordFilter-pt")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.ro.min.js b/assets/javascripts/lunr/min/lunr.ro.min.js
new file mode 100644
index 0000000..7277140
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.ro.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Romanian` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,i){"function"==typeof define&&define.amd?define(i):"object"==typeof exports?module.exports=i():i()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ro=function(){this.pipeline.reset(),this.pipeline.add(e.ro.trimmer,e.ro.stopWordFilter,e.ro.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ro.stemmer))},e.ro.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.ro.trimmer=e.trimmerSupport.generateTrimmer(e.ro.wordCharacters),e.Pipeline.registerFunction(e.ro.trimmer,"trimmer-ro"),e.ro.stemmer=function(){var i=e.stemmerSupport.Among,r=e.stemmerSupport.SnowballProgram,n=new function(){function e(e,i){L.eq_s(1,e)&&(L.ket=L.cursor,L.in_grouping(W,97,259)&&L.slice_from(i))}function n(){for(var i,r;;){if(i=L.cursor,L.in_grouping(W,97,259)&&(r=L.cursor,L.bra=r,e("u","U"),L.cursor=r,e("i","I")),L.cursor=i,L.cursor>=L.limit)break;L.cursor++}}function t(){if(L.out_grouping(W,97,259)){for(;!L.in_grouping(W,97,259);){if(L.cursor>=L.limit)return!0;L.cursor++}return!1}return!0}function a(){if(L.in_grouping(W,97,259))for(;!L.out_grouping(W,97,259);){if(L.cursor>=L.limit)return!0;L.cursor++}return!1}function o(){var e,i,r=L.cursor;if(L.in_grouping(W,97,259)){if(e=L.cursor,!t())return void(h=L.cursor);if(L.cursor=e,!a())return void(h=L.cursor)}L.cursor=r,L.out_grouping(W,97,259)&&(i=L.cursor,t()&&(L.cursor=i,L.in_grouping(W,97,259)&&L.cursor<L.limit&&L.cursor++),h=L.cursor)}function u(){for(;!L.in_grouping(W,97,259);){if(L.cursor>=L.limit)return!1;L.cursor++}for(;!L.out_grouping(W,97,259);){if(L.cursor>=L.limit)return!1;L.cursor++}return!0}function c(){var e=L.cursor;h=L.limit,k=h,g=h,o(),L.cursor=e,u()&&(k=L.cursor,u()&&(g=L.cursor))}function 
s(){for(var e;;){if(L.bra=L.cursor,e=L.find_among(z,3))switch(L.ket=L.cursor,e){case 1:L.slice_from("i");continue;case 2:L.slice_from("u");continue;case 3:if(L.cursor>=L.limit)break;L.cursor++;continue}break}}function w(){return h<=L.cursor}function m(){return k<=L.cursor}function l(){return g<=L.cursor}function f(){var e,i;if(L.ket=L.cursor,(e=L.find_among_b(C,16))&&(L.bra=L.cursor,m()))switch(e){case 1:L.slice_del();break;case 2:L.slice_from("a");break;case 3:L.slice_from("e");break;case 4:L.slice_from("i");break;case 5:i=L.limit-L.cursor,L.eq_s_b(2,"ab")||(L.cursor=L.limit-i,L.slice_from("i"));break;case 6:L.slice_from("at");break;case 7:L.slice_from("aţi")}}function p(){var e,i=L.limit-L.cursor;if(L.ket=L.cursor,(e=L.find_among_b(P,46))&&(L.bra=L.cursor,m())){switch(e){case 1:L.slice_from("abil");break;case 2:L.slice_from("ibil");break;case 3:L.slice_from("iv");break;case 4:L.slice_from("ic");break;case 5:L.slice_from("at");break;case 6:L.slice_from("it")}return _=!0,L.cursor=L.limit-i,!0}return!1}function d(){var e,i;for(_=!1;;)if(i=L.limit-L.cursor,!p()){L.cursor=L.limit-i;break}if(L.ket=L.cursor,(e=L.find_among_b(F,62))&&(L.bra=L.cursor,l())){switch(e){case 1:L.slice_del();break;case 2:L.eq_s_b(1,"ţ")&&(L.bra=L.cursor,L.slice_from("t"));break;case 3:L.slice_from("ist")}_=!0}}function b(){var e,i,r;if(L.cursor>=h){if(i=L.limit_backward,L.limit_backward=h,L.ket=L.cursor,e=L.find_among_b(q,94))switch(L.bra=L.cursor,e){case 1:if(r=L.limit-L.cursor,!L.out_grouping_b(W,97,259)&&(L.cursor=L.limit-r,!L.eq_s_b(1,"u")))break;case 2:L.slice_del()}L.limit_backward=i}}function v(){var e;L.ket=L.cursor,(e=L.find_among_b(S,5))&&(L.bra=L.cursor,w()&&1==e&&L.slice_del())}var _,g,k,h,z=[new i("",-1,3),new i("I",0,1),new i("U",0,2)],C=[new i("ea",-1,3),new i("aţia",-1,7),new i("aua",-1,2),new i("iua",-1,4),new i("aţie",-1,7),new i("ele",-1,3),new i("ile",-1,5),new i("iile",6,4),new i("iei",-1,4),new i("atei",-1,6),new i("ii",-1,4),new i("ului",-1,1),new i("ul",-1,1),new 
i("elor",-1,3),new i("ilor",-1,4),new i("iilor",14,4)],P=[new i("icala",-1,4),new i("iciva",-1,4),new i("ativa",-1,5),new i("itiva",-1,6),new i("icale",-1,4),new i("aţiune",-1,5),new i("iţiune",-1,6),new i("atoare",-1,5),new i("itoare",-1,6),new i("ătoare",-1,5),new i("icitate",-1,4),new i("abilitate",-1,1),new i("ibilitate",-1,2),new i("ivitate",-1,3),new i("icive",-1,4),new i("ative",-1,5),new i("itive",-1,6),new i("icali",-1,4),new i("atori",-1,5),new i("icatori",18,4),new i("itori",-1,6),new i("ători",-1,5),new i("icitati",-1,4),new i("abilitati",-1,1),new i("ivitati",-1,3),new i("icivi",-1,4),new i("ativi",-1,5),new i("itivi",-1,6),new i("icităi",-1,4),new i("abilităi",-1,1),new i("ivităi",-1,3),new i("icităţi",-1,4),new i("abilităţi",-1,1),new i("ivităţi",-1,3),new i("ical",-1,4),new i("ator",-1,5),new i("icator",35,4),new i("itor",-1,6),new i("ător",-1,5),new i("iciv",-1,4),new i("ativ",-1,5),new i("itiv",-1,6),new i("icală",-1,4),new i("icivă",-1,4),new i("ativă",-1,5),new i("itivă",-1,6)],F=[new i("ica",-1,1),new i("abila",-1,1),new i("ibila",-1,1),new i("oasa",-1,1),new i("ata",-1,1),new i("ita",-1,1),new i("anta",-1,1),new i("ista",-1,3),new i("uta",-1,1),new i("iva",-1,1),new i("ic",-1,1),new i("ice",-1,1),new i("abile",-1,1),new i("ibile",-1,1),new i("isme",-1,3),new i("iune",-1,2),new i("oase",-1,1),new i("ate",-1,1),new i("itate",17,1),new i("ite",-1,1),new i("ante",-1,1),new i("iste",-1,3),new i("ute",-1,1),new i("ive",-1,1),new i("ici",-1,1),new i("abili",-1,1),new i("ibili",-1,1),new i("iuni",-1,2),new i("atori",-1,1),new i("osi",-1,1),new i("ati",-1,1),new i("itati",30,1),new i("iti",-1,1),new i("anti",-1,1),new i("isti",-1,3),new i("uti",-1,1),new i("işti",-1,3),new i("ivi",-1,1),new i("ităi",-1,1),new i("oşi",-1,1),new i("ităţi",-1,1),new i("abil",-1,1),new i("ibil",-1,1),new i("ism",-1,3),new i("ator",-1,1),new i("os",-1,1),new i("at",-1,1),new i("it",-1,1),new i("ant",-1,1),new i("ist",-1,3),new i("ut",-1,1),new i("iv",-1,1),new 
i("ică",-1,1),new i("abilă",-1,1),new i("ibilă",-1,1),new i("oasă",-1,1),new i("ată",-1,1),new i("ită",-1,1),new i("antă",-1,1),new i("istă",-1,3),new i("ută",-1,1),new i("ivă",-1,1)],q=[new i("ea",-1,1),new i("ia",-1,1),new i("esc",-1,1),new i("ăsc",-1,1),new i("ind",-1,1),new i("ând",-1,1),new i("are",-1,1),new i("ere",-1,1),new i("ire",-1,1),new i("âre",-1,1),new i("se",-1,2),new i("ase",10,1),new i("sese",10,2),new i("ise",10,1),new i("use",10,1),new i("âse",10,1),new i("eşte",-1,1),new i("ăşte",-1,1),new i("eze",-1,1),new i("ai",-1,1),new i("eai",19,1),new i("iai",19,1),new i("sei",-1,2),new i("eşti",-1,1),new i("ăşti",-1,1),new i("ui",-1,1),new i("ezi",-1,1),new i("âi",-1,1),new i("aşi",-1,1),new i("seşi",-1,2),new i("aseşi",29,1),new i("seseşi",29,2),new i("iseşi",29,1),new i("useşi",29,1),new i("âseşi",29,1),new i("işi",-1,1),new i("uşi",-1,1),new i("âşi",-1,1),new i("aţi",-1,2),new i("eaţi",38,1),new i("iaţi",38,1),new i("eţi",-1,2),new i("iţi",-1,2),new i("âţi",-1,2),new i("arăţi",-1,1),new i("serăţi",-1,2),new i("aserăţi",45,1),new i("seserăţi",45,2),new i("iserăţi",45,1),new i("userăţi",45,1),new i("âserăţi",45,1),new i("irăţi",-1,1),new i("urăţi",-1,1),new i("ârăţi",-1,1),new i("am",-1,1),new i("eam",54,1),new i("iam",54,1),new i("em",-1,2),new i("asem",57,1),new i("sesem",57,2),new i("isem",57,1),new i("usem",57,1),new i("âsem",57,1),new i("im",-1,2),new i("âm",-1,2),new i("ăm",-1,2),new i("arăm",65,1),new i("serăm",65,2),new i("aserăm",67,1),new i("seserăm",67,2),new i("iserăm",67,1),new i("userăm",67,1),new i("âserăm",67,1),new i("irăm",65,1),new i("urăm",65,1),new i("ârăm",65,1),new i("au",-1,1),new i("eau",76,1),new i("iau",76,1),new i("indu",-1,1),new i("ându",-1,1),new i("ez",-1,1),new i("ească",-1,1),new i("ară",-1,1),new i("seră",-1,2),new i("aseră",84,1),new i("seseră",84,2),new i("iseră",84,1),new i("useră",84,1),new i("âseră",84,1),new i("iră",-1,1),new i("ură",-1,1),new i("âră",-1,1),new i("ează",-1,1)],S=[new i("a",-1,1),new 
i("e",-1,1),new i("ie",1,1),new i("i",-1,1),new i("ă",-1,1)],W=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,2,32,0,0,4],L=new r;this.setCurrent=function(e){L.setCurrent(e)},this.getCurrent=function(){return L.getCurrent()},this.stem=function(){var e=L.cursor;return n(),L.cursor=e,c(),L.limit_backward=e,L.cursor=L.limit,f(),L.cursor=L.limit,d(),L.cursor=L.limit,_||(L.cursor=L.limit,b(),L.cursor=L.limit),v(),L.cursor=L.limit_backward,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.ro.stemmer,"stemmer-ro"),e.ro.stopWordFilter=e.generateStopWordFilter("acea aceasta această aceea acei aceia acel acela acele acelea acest acesta aceste acestea aceşti aceştia acolo acord acum ai aia aibă aici al ale alea altceva altcineva am ar are asemenea asta astea astăzi asupra au avea avem aveţi azi aş aşadar aţi bine bucur bună ca care caut ce cel ceva chiar cinci cine cineva contra cu cum cumva curând curînd când cât câte câtva câţi cînd cît cîte cîtva cîţi că căci cărei căror cărui către da dacă dar datorită dată dau de deci deja deoarece departe deşi din dinaintea dintr- dintre doi doilea două drept după dă ea ei el ele eram este eu eşti face fata fi fie fiecare fii fim fiu fiţi frumos fără graţie halbă iar ieri la le li lor lui lângă lîngă mai mea mei mele mereu meu mi mie mine mult multă mulţi mulţumesc mâine mîine mă ne nevoie nici nicăieri nimeni nimeri nimic nişte noastre noastră noi noroc nostru nouă noştri nu opt ori oricare orice oricine oricum oricând oricât oricînd oricît oriunde patra patru patrulea pe pentru peste pic poate pot prea prima primul prin puţin puţina puţină până pînă rog sa sale sau se spate spre sub sunt suntem sunteţi sută sînt sîntem sînteţi să săi său ta tale te timp tine toate toată tot totuşi toţi trei treia treilea tu tăi tău un una unde undeva unei uneia unele uneori unii unor unora unu unui unuia 
unul vi voastre voastră voi vostru vouă voştri vreme vreo vreun vă zece zero zi zice îi îl îmi împotriva în  înainte înaintea încotro încât încît între întrucât întrucît îţi ăla ălea ăsta ăstea ăştia şapte şase şi ştiu ţi ţie".split(" ")),e.Pipeline.registerFunction(e.ro.stopWordFilter,"stopWordFilter-ro")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.ru.min.js b/assets/javascripts/lunr/min/lunr.ru.min.js
new file mode 100644
index 0000000..186cc48
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.ru.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Russian` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,n){"function"==typeof define&&define.amd?define(n):"object"==typeof exports?module.exports=n():n()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ru=function(){this.pipeline.reset(),this.pipeline.add(e.ru.trimmer,e.ru.stopWordFilter,e.ru.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ru.stemmer))},e.ru.wordCharacters="Ѐ-҄҇-ԯᴫᵸⷠ-ⷿꙀ-ꚟ︮︯",e.ru.trimmer=e.trimmerSupport.generateTrimmer(e.ru.wordCharacters),e.Pipeline.registerFunction(e.ru.trimmer,"trimmer-ru"),e.ru.stemmer=function(){var n=e.stemmerSupport.Among,r=e.stemmerSupport.SnowballProgram,t=new function(){function e(){for(;!W.in_grouping(S,1072,1103);){if(W.cursor>=W.limit)return!1;W.cursor++}return!0}function t(){for(;!W.out_grouping(S,1072,1103);){if(W.cursor>=W.limit)return!1;W.cursor++}return!0}function w(){b=W.limit,_=b,e()&&(b=W.cursor,t()&&e()&&t()&&(_=W.cursor))}function i(){return _<=W.cursor}function u(e,n){var r,t;if(W.ket=W.cursor,r=W.find_among_b(e,n)){switch(W.bra=W.cursor,r){case 1:if(t=W.limit-W.cursor,!W.eq_s_b(1,"а")&&(W.cursor=W.limit-t,!W.eq_s_b(1,"я")))return!1;case 2:W.slice_del()}return!0}return!1}function o(){return u(h,9)}function s(e,n){var r;return W.ket=W.cursor,!!(r=W.find_among_b(e,n))&&(W.bra=W.cursor,1==r&&W.slice_del(),!0)}function c(){return s(g,26)}function m(){return!!c()&&(u(C,8),!0)}function f(){return s(k,2)}function l(){return u(P,46)}function a(){s(v,36)}function p(){var e;W.ket=W.cursor,(e=W.find_among_b(F,2))&&(W.bra=W.cursor,i()&&1==e&&W.slice_del())}function d(){var e;if(W.ket=W.cursor,e=W.find_among_b(q,4))switch(W.bra=W.cursor,e){case 1:if(W.slice_del(),W.ket=W.cursor,!W.eq_s_b(1,"н"))break;W.bra=W.cursor;case 2:if(!W.eq_s_b(1,"н"))break;case 3:W.slice_del()}}var 
_,b,h=[new n("в",-1,1),new n("ив",0,2),new n("ыв",0,2),new n("вши",-1,1),new n("ивши",3,2),new n("ывши",3,2),new n("вшись",-1,1),new n("ившись",6,2),new n("ывшись",6,2)],g=[new n("ее",-1,1),new n("ие",-1,1),new n("ое",-1,1),new n("ые",-1,1),new n("ими",-1,1),new n("ыми",-1,1),new n("ей",-1,1),new n("ий",-1,1),new n("ой",-1,1),new n("ый",-1,1),new n("ем",-1,1),new n("им",-1,1),new n("ом",-1,1),new n("ым",-1,1),new n("его",-1,1),new n("ого",-1,1),new n("ему",-1,1),new n("ому",-1,1),new n("их",-1,1),new n("ых",-1,1),new n("ею",-1,1),new n("ою",-1,1),new n("ую",-1,1),new n("юю",-1,1),new n("ая",-1,1),new n("яя",-1,1)],C=[new n("ем",-1,1),new n("нн",-1,1),new n("вш",-1,1),new n("ивш",2,2),new n("ывш",2,2),new n("щ",-1,1),new n("ющ",5,1),new n("ующ",6,2)],k=[new n("сь",-1,1),new n("ся",-1,1)],P=[new n("ла",-1,1),new n("ила",0,2),new n("ыла",0,2),new n("на",-1,1),new n("ена",3,2),new n("ете",-1,1),new n("ите",-1,2),new n("йте",-1,1),new n("ейте",7,2),new n("уйте",7,2),new n("ли",-1,1),new n("или",10,2),new n("ыли",10,2),new n("й",-1,1),new n("ей",13,2),new n("уй",13,2),new n("л",-1,1),new n("ил",16,2),new n("ыл",16,2),new n("ем",-1,1),new n("им",-1,2),new n("ым",-1,2),new n("н",-1,1),new n("ен",22,2),new n("ло",-1,1),new n("ило",24,2),new n("ыло",24,2),new n("но",-1,1),new n("ено",27,2),new n("нно",27,1),new n("ет",-1,1),new n("ует",30,2),new n("ит",-1,2),new n("ыт",-1,2),new n("ют",-1,1),new n("уют",34,2),new n("ят",-1,2),new n("ны",-1,1),new n("ены",37,2),new n("ть",-1,1),new n("ить",39,2),new n("ыть",39,2),new n("ешь",-1,1),new n("ишь",-1,2),new n("ю",-1,2),new n("ую",44,2)],v=[new n("а",-1,1),new n("ев",-1,1),new n("ов",-1,1),new n("е",-1,1),new n("ие",3,1),new n("ье",3,1),new n("и",-1,1),new n("еи",6,1),new n("ии",6,1),new n("ами",6,1),new n("ями",6,1),new n("иями",10,1),new n("й",-1,1),new n("ей",12,1),new n("ией",13,1),new n("ий",12,1),new n("ой",12,1),new n("ам",-1,1),new n("ем",-1,1),new n("ием",18,1),new n("ом",-1,1),new n("ям",-1,1),new n("иям",21,1),new 
n("о",-1,1),new n("у",-1,1),new n("ах",-1,1),new n("ях",-1,1),new n("иях",26,1),new n("ы",-1,1),new n("ь",-1,1),new n("ю",-1,1),new n("ию",30,1),new n("ью",30,1),new n("я",-1,1),new n("ия",33,1),new n("ья",33,1)],F=[new n("ост",-1,1),new n("ость",-1,1)],q=[new n("ейше",-1,1),new n("н",-1,2),new n("ейш",-1,1),new n("ь",-1,3)],S=[33,65,8,232],W=new r;this.setCurrent=function(e){W.setCurrent(e)},this.getCurrent=function(){return W.getCurrent()},this.stem=function(){return w(),W.cursor=W.limit,!(W.cursor<b)&&(W.limit_backward=b,o()||(W.cursor=W.limit,f()||(W.cursor=W.limit),m()||(W.cursor=W.limit,l()||(W.cursor=W.limit,a()))),W.cursor=W.limit,W.ket=W.cursor,W.eq_s_b(1,"и")?(W.bra=W.cursor,W.slice_del()):W.cursor=W.limit,p(),W.cursor=W.limit,d(),!0)}};return function(e){return"function"==typeof e.update?e.update(function(e){return t.setCurrent(e),t.stem(),t.getCurrent()}):(t.setCurrent(e),t.stem(),t.getCurrent())}}(),e.Pipeline.registerFunction(e.ru.stemmer,"stemmer-ru"),e.ru.stopWordFilter=e.generateStopWordFilter("алло без близко более больше будем будет будете будешь будто буду будут будь бы бывает бывь был была были было быть в важная важное важные важный вам вами вас ваш ваша ваше ваши вверх вдали вдруг ведь везде весь вниз внизу во вокруг вон восемнадцатый восемнадцать восемь восьмой вот впрочем времени время все всегда всего всем всеми всему всех всею всю всюду вся всё второй вы г где говорил говорит год года году да давно даже далеко дальше даром два двадцатый двадцать две двенадцатый двенадцать двух девятнадцатый девятнадцать девятый девять действительно дел день десятый десять для до довольно долго должно другая другие других друго другое другой е его ее ей ему если есть еще ещё ею её ж же жизнь за занят занята занято заняты затем зато зачем здесь значит и из или им именно иметь ими имя иногда их к каждая каждое каждые каждый кажется как какая какой кем когда кого ком кому конечно которая которого которой которые который которых кроме кругом кто куда лет ли 
лишь лучше люди м мало между меля менее меньше меня миллионов мимо мира мне много многочисленная многочисленное многочисленные многочисленный мной мною мог могут мож может можно можхо мои мой мор мочь моя моё мы на наверху над надо назад наиболее наконец нам нами нас начала наш наша наше наши не него недавно недалеко нее ней нельзя нем немного нему непрерывно нередко несколько нет нею неё ни нибудь ниже низко никогда никуда ними них ничего но ну нужно нх о об оба обычно один одиннадцатый одиннадцать однажды однако одного одной около он она они оно опять особенно от отовсюду отсюда очень первый перед по под пожалуйста позже пока пор пора после посреди потом потому почему почти прекрасно при про просто против процентов пятнадцатый пятнадцать пятый пять раз разве рано раньше рядом с сам сама сами самим самими самих само самого самой самом самому саму свое своего своей свои своих свою сеаой себе себя сегодня седьмой сейчас семнадцатый семнадцать семь сих сказал сказала сказать сколько слишком сначала снова со собой собою совсем спасибо стал суть т та так такая также такие такое такой там твой твоя твоё те тебе тебя тем теми теперь тех то тобой тобою тогда того тоже только том тому тот тою третий три тринадцатый тринадцать ту туда тут ты тысяч у уж уже уметь хорошо хотеть хоть хотя хочешь часто чаще чего человек чем чему через четвертый четыре четырнадцатый четырнадцать что чтоб чтобы чуть шестнадцатый шестнадцать шестой шесть эта эти этим этими этих это этого этой этом этому этот эту я \ufeffа".split(" ")),e.Pipeline.registerFunction(e.ru.stopWordFilter,"stopWordFilter-ru")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.sa.min.js b/assets/javascripts/lunr/min/lunr.sa.min.js
new file mode 100644
index 0000000..50ee564
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.sa.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.sa=function(){this.pipeline.reset(),this.pipeline.add(e.sa.trimmer,e.sa.stopWordFilter,e.sa.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.sa.stemmer))},e.sa.wordCharacters="ऀ-ःऄ-एऐ-टठ-यर-िी-ॏॐ-य़ॠ-९॰-ॿ꣠-꣱ꣲ-ꣷ꣸-ꣻ꣼-ꣽꣾ-ꣿᆰ0-ᆰ9",e.sa.trimmer=e.trimmerSupport.generateTrimmer(e.sa.wordCharacters),e.Pipeline.registerFunction(e.sa.trimmer,"trimmer-sa"),e.sa.stopWordFilter=e.generateStopWordFilter('तथा अयम्‌ एकम्‌ इत्यस्मिन्‌ तथा तत्‌ वा अयम्‌ इत्यस्य ते आहूत उपरि तेषाम्‌  किन्तु तेषाम्‌ तदा इत्यनेन अधिकः इत्यस्य तत्‌ केचन बहवः द्वि तथा महत्वपूर्णः अयम्‌ अस्य  विषये अयं अस्ति तत्‌ प्रथमः विषये इत्युपरि इत्युपरि इतर अधिकतमः अधिकः अपि सामान्यतया ठ इतरेतर नूतनम्‌ द  न्यूनम्‌ कश्चित्‌ वा विशालः द  सः अस्ति तदनुसारम् तत्र अस्ति केवलम्‌ अपि अत्र सर्वे विविधाः तत्‌ बहवः यतः इदानीम्‌ द  दक्षिण इत्यस्मै तस्य उपरि नथ अतीव कार्यम्‌ सर्वे एकैकम्‌ इत्यादि। एते सन्ति  उत इत्थम्‌ मध्ये एतदर्थं . स कस्य प्रथमः श्री. करोति अस्मिन् प्रकारः निर्मिता कालः तत्र कर्तुं  समान अधुना ते सन्ति स एकः अस्ति सः अर्थात् तेषां कृते . 
स्थितम्  विशेषः अग्रिम तेषाम्‌ समान स्रोतः ख म समान इदानीमपि अधिकतया करोतु ते समान इत्यस्य वीथी सह यस्मिन्  कृतवान्‌ धृतः तदा पुनः पूर्वं सः आगतः किम्‌ कुल इतर पुरा  मात्रा स विषये उ अतएव अपि नगरस्य  उपरि यतः प्रतिशतं  कतरः कालः साधनानि भूत तथापि जात सम्बन्धि अन्यत्‌ ग अतः अस्माकं स्वकीयाः अस्माकं इदानीं अन्तः इत्यादयः भवन्तः इत्यादयः एते एताः तस्य अस्य इदम् एते तेषां तेषां तेषां तान् तेषां तेषां तेषां समानः सः एकः च तादृशाः बहवः अन्ये च वदन्ति यत् कियत् कस्मै  कस्मै  यस्मै  यस्मै  यस्मै  यस्मै न अतिनीचः किन्तु प्रथमं सम्पूर्णतया  ततः चिरकालानन्तरं पुस्तकं सम्पूर्णतया अन्तः  किन्तु अत्र वा इह इव श्रद्धाय अवशिष्यते  परन्तु अन्ये वर्गाः सन्ति ते सन्ति शक्नुवन्ति सर्वे मिलित्वा सर्वे एकत्र"'.split(" ")),e.sa.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var r=e.wordcut;r.init(),e.sa.tokenizer=function(t){if(!arguments.length||null==t||void 0==t)return[];if(Array.isArray(t))return t.map(function(r){return isLunr2?new e.Token(r.toLowerCase()):r.toLowerCase()});var i=t.toString().toLowerCase().replace(/^\s+/,"");return r.cut(i).split("|")},e.Pipeline.registerFunction(e.sa.stemmer,"stemmer-sa"),e.Pipeline.registerFunction(e.sa.stopWordFilter,"stopWordFilter-sa")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.stemmer.support.min.js b/assets/javascripts/lunr/min/lunr.stemmer.support.min.js
new file mode 100644
index 0000000..abd4475
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.stemmer.support.min.js
@@ -0,0 +1 @@
+!function(r,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(r.lunr)}(this,function(){return function(r){r.stemmerSupport={Among:function(r,t,i,s){if(this.toCharArray=function(r){for(var t=r.length,i=new Array(t),s=0;s<t;s++)i[s]=r.charCodeAt(s);return i},!r&&""!=r||!t&&0!=t||!i)throw"Bad Among initialisation: s:"+r+", substring_i: "+t+", result: "+i;this.s_size=r.length,this.s=this.toCharArray(r),this.substring_i=t,this.result=i,this.method=s},SnowballProgram:function(){var r;return{bra:0,ket:0,limit:0,cursor:0,limit_backward:0,setCurrent:function(t){r=t,this.cursor=0,this.limit=t.length,this.limit_backward=0,this.bra=this.cursor,this.ket=this.limit},getCurrent:function(){var t=r;return r=null,t},in_grouping:function(t,i,s){if(this.cursor<this.limit){var e=r.charCodeAt(this.cursor);if(e<=s&&e>=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor++,!0}return!1},in_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e<=s&&e>=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor--,!0}return!1},out_grouping:function(t,i,s){if(this.cursor<this.limit){var e=r.charCodeAt(this.cursor);if(e>s||e<i)return this.cursor++,!0;if(e-=i,!(t[e>>3]&1<<(7&e)))return this.cursor++,!0}return!1},out_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e>s||e<i)return this.cursor--,!0;if(e-=i,!(t[e>>3]&1<<(7&e)))return this.cursor--,!0}return!1},eq_s:function(t,i){if(this.limit-this.cursor<t)return!1;for(var s=0;s<t;s++)if(r.charCodeAt(this.cursor+s)!=i.charCodeAt(s))return!1;return this.cursor+=t,!0},eq_s_b:function(t,i){if(this.cursor-this.limit_backward<t)return!1;for(var s=0;s<t;s++)if(r.charCodeAt(this.cursor-t+s)!=i.charCodeAt(s))return!1;return this.cursor-=t,!0},find_among:function(t,i){for(var s=0,e=i,n=this.cursor,u=this.limit,o=0,h=0,c=!1;;){for(var 
a=s+(e-s>>1),f=0,l=o<h?o:h,_=t[a],m=l;m<_.s_size;m++){if(n+l==u){f=-1;break}if(f=r.charCodeAt(n+l)-_.s[m])break;l++}if(f<0?(e=a,h=l):(s=a,o=l),e-s<=1){if(s>0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n+_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n+_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},find_among_b:function(t,i){for(var s=0,e=i,n=this.cursor,u=this.limit_backward,o=0,h=0,c=!1;;){for(var a=s+(e-s>>1),f=0,l=o<h?o:h,_=t[a],m=_.s_size-1-l;m>=0;m--){if(n-l==u){f=-1;break}if(f=r.charCodeAt(n-1-l)-_.s[m])break;l++}if(f<0?(e=a,h=l):(s=a,o=l),e-s<=1){if(s>0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n-_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n-_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},replace_s:function(t,i,s){var e=s.length-(i-t),n=r.substring(0,t),u=r.substring(i);return r=n+s+u,this.limit+=e,this.cursor>=i?this.cursor+=e:this.cursor>t&&(this.cursor=t),e},slice_check:function(){if(this.bra<0||this.bra>this.ket||this.ket>this.limit||this.limit>r.length)throw"faulty slice operation"},slice_from:function(r){this.slice_check(),this.replace_s(this.bra,this.ket,r)},slice_del:function(){this.slice_from("")},insert:function(r,t,i){var s=this.replace_s(r,t,i);r<=this.bra&&(this.bra+=s),r<=this.ket&&(this.ket+=s)},slice_to:function(){return this.slice_check(),r.substring(this.bra,this.ket)},eq_v_b:function(r){return this.eq_s_b(r.length,r)}}}},r.trimmerSupport={generateTrimmer:function(r){var t=new RegExp("^[^"+r+"]+"),i=new RegExp("[^"+r+"]+$");return function(r){return"function"==typeof r.update?r.update(function(r){return r.replace(t,"").replace(i,"")}):r.replace(t,"").replace(i,"")}}}}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.sv.min.js b/assets/javascripts/lunr/min/lunr.sv.min.js
new file mode 100644
index 0000000..3e5eb64
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.sv.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Swedish` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.sv=function(){this.pipeline.reset(),this.pipeline.add(e.sv.trimmer,e.sv.stopWordFilter,e.sv.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.sv.stemmer))},e.sv.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",e.sv.trimmer=e.trimmerSupport.generateTrimmer(e.sv.wordCharacters),e.Pipeline.registerFunction(e.sv.trimmer,"trimmer-sv"),e.sv.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,t=new function(){function e(){var e,r=w.cursor+3;if(o=w.limit,0<=r||r<=w.limit){for(a=r;;){if(e=w.cursor,w.in_grouping(l,97,246)){w.cursor=e;break}if(w.cursor=e,w.cursor>=w.limit)return;w.cursor++}for(;!w.out_grouping(l,97,246);){if(w.cursor>=w.limit)return;w.cursor++}o=w.cursor,o<a&&(o=a)}}function t(){var e,r=w.limit_backward;if(w.cursor>=o&&(w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(u,37),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.in_grouping_b(d,98,121)&&w.slice_del()}}function i(){var e=w.limit_backward;w.cursor>=o&&(w.limit_backward=o,w.cursor=w.limit,w.find_among_b(c,7)&&(w.cursor=w.limit,w.ket=w.cursor,w.cursor>w.limit_backward&&(w.bra=--w.cursor,w.slice_del())),w.limit_backward=e)}function s(){var e,r;if(w.cursor>=o){if(r=w.limit_backward,w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(m,5))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.slice_from("lös");break;case 3:w.slice_from("full")}w.limit_backward=r}}var a,o,u=[new r("a",-1,1),new r("arna",0,1),new 
r("erna",0,1),new r("heterna",2,1),new r("orna",0,1),new r("ad",-1,1),new r("e",-1,1),new r("ade",6,1),new r("ande",6,1),new r("arne",6,1),new r("are",6,1),new r("aste",6,1),new r("en",-1,1),new r("anden",12,1),new r("aren",12,1),new r("heten",12,1),new r("ern",-1,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",18,1),new r("or",-1,1),new r("s",-1,2),new r("as",21,1),new r("arnas",22,1),new r("ernas",22,1),new r("ornas",22,1),new r("es",21,1),new r("ades",26,1),new r("andes",26,1),new r("ens",21,1),new r("arens",29,1),new r("hetens",29,1),new r("erns",21,1),new r("at",-1,1),new r("andet",-1,1),new r("het",-1,1),new r("ast",-1,1)],c=[new r("dd",-1,-1),new r("gd",-1,-1),new r("nn",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1),new r("tt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("els",-1,1),new r("fullt",-1,3),new r("löst",-1,2)],l=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,24,0,32],d=[119,127,149],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,t(),w.cursor=w.limit,i(),w.cursor=w.limit,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return t.setCurrent(e),t.stem(),t.getCurrent()}):(t.setCurrent(e),t.stem(),t.getCurrent())}}(),e.Pipeline.registerFunction(e.sv.stemmer,"stemmer-sv"),e.sv.stopWordFilter=e.generateStopWordFilter("alla allt att av blev bli blir blivit de dem den denna deras dess dessa det detta dig din dina ditt du där då efter ej eller en er era ert ett från för ha hade han hans har henne hennes hon honom hur här i icke ingen inom inte jag ju kan kunde man med mellan men mig min mina mitt mot mycket ni nu när någon något några och om oss på samma sedan sig sin sina sitta själv skulle som så sådan sådana sådant till under upp ut utan vad var vara varför varit varje vars vart vem vi vid vilka vilkas vilken vilket vår våra vårt än är åt över".split(" 
")),e.Pipeline.registerFunction(e.sv.stopWordFilter,"stopWordFilter-sv")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.ta.min.js b/assets/javascripts/lunr/min/lunr.ta.min.js
new file mode 100644
index 0000000..a644bed
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.ta.min.js
@@ -0,0 +1 @@
+!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ta=function(){this.pipeline.reset(),this.pipeline.add(e.ta.trimmer,e.ta.stopWordFilter,e.ta.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ta.stemmer))},e.ta.wordCharacters="஀-உஊ-ஏஐ-ஙச-ட஠-னப-யர-ஹ஺-ிீ-௉ொ-௏ௐ-௙௚-௟௠-௩௪-௯௰-௹௺-௿a-zA-Zａ-ｚＡ-Ｚ0-9０-９",e.ta.trimmer=e.trimmerSupport.generateTrimmer(e.ta.wordCharacters),e.Pipeline.registerFunction(e.ta.trimmer,"trimmer-ta"),e.ta.stopWordFilter=e.generateStopWordFilter("அங்கு அங்கே அது அதை அந்த அவர் அவர்கள் அவள் அவன் அவை ஆக ஆகவே ஆகையால் ஆதலால் ஆதலினால் ஆனாலும் ஆனால் இங்கு இங்கே இது இதை இந்த இப்படி இவர் இவர்கள் இவள் இவன் இவை இவ்வளவு உனக்கு உனது உன் உன்னால் எங்கு எங்கே எது எதை எந்த எப்படி எவர் எவர்கள் எவள் எவன் எவை எவ்வளவு எனக்கு எனது எனவே என் என்ன என்னால் ஏது ஏன் தனது தன்னால் தானே தான் நாங்கள் நாம் நான் நீ நீங்கள்".split(" ")),e.ta.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var t=e.wordcut;t.init(),e.ta.tokenizer=function(r){if(!arguments.length||null==r||void 0==r)return[];if(Array.isArray(r))return r.map(function(t){return isLunr2?new e.Token(t.toLowerCase()):t.toLowerCase()});var i=r.toString().toLowerCase().replace(/^\s+/,"");return t.cut(i).split("|")},e.Pipeline.registerFunction(e.ta.stemmer,"stemmer-ta"),e.Pipeline.registerFunction(e.ta.stopWordFilter,"stopWordFilter-ta")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.te.min.js b/assets/javascripts/lunr/min/lunr.te.min.js
new file mode 100644
index 0000000..9fa7a93
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.te.min.js
@@ -0,0 +1 @@
+!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.te=function(){this.pipeline.reset(),this.pipeline.add(e.te.trimmer,e.te.stopWordFilter,e.te.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.te.stemmer))},e.te.wordCharacters="ఀ-ఄఅ-ఔక-హా-ౌౕ-ౖౘ-ౚౠ-ౡౢ-ౣ౦-౯౸-౿఼ఽ్ౝ౷౤౥",e.te.trimmer=e.trimmerSupport.generateTrimmer(e.te.wordCharacters),e.Pipeline.registerFunction(e.te.trimmer,"trimmer-te"),e.te.stopWordFilter=e.generateStopWordFilter("అందరూ అందుబాటులో అడగండి అడగడం అడ్డంగా అనుగుణంగా అనుమతించు అనుమతిస్తుంది అయితే ఇప్పటికే ఉన్నారు ఎక్కడైనా ఎప్పుడు ఎవరైనా ఎవరో ఏ ఏదైనా ఏమైనప్పటికి ఒక ఒకరు కనిపిస్తాయి కాదు కూడా గా గురించి చుట్టూ చేయగలిగింది తగిన తర్వాత దాదాపు దూరంగా నిజంగా పై ప్రకారం ప్రక్కన మధ్య మరియు మరొక మళ్ళీ మాత్రమే మెచ్చుకో వద్ద వెంట వేరుగా వ్యతిరేకంగా సంబంధం".split(" ")),e.te.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var t=e.wordcut;t.init(),e.te.tokenizer=function(r){if(!arguments.length||null==r||void 0==r)return[];if(Array.isArray(r))return r.map(function(t){return isLunr2?new e.Token(t.toLowerCase()):t.toLowerCase()});var i=r.toString().toLowerCase().replace(/^\s+/,"");return t.cut(i).split("|")},e.Pipeline.registerFunction(e.te.stemmer,"stemmer-te"),e.Pipeline.registerFunction(e.te.stopWordFilter,"stopWordFilter-te")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.th.min.js b/assets/javascripts/lunr/min/lunr.th.min.js
new file mode 100644
index 0000000..dee3aac
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.th.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.th=function(){this.pipeline.reset(),this.pipeline.add(e.th.trimmer),r?this.tokenizer=e.th.tokenizer:(e.tokenizer&&(e.tokenizer=e.th.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.th.tokenizer))},e.th.wordCharacters="[฀-๿]",e.th.trimmer=e.trimmerSupport.generateTrimmer(e.th.wordCharacters),e.Pipeline.registerFunction(e.th.trimmer,"trimmer-th");var t=e.wordcut;t.init(),e.th.tokenizer=function(i){if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t):t});var n=i.toString().replace(/^\s+/,"");return t.cut(n).split("|")}}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.tr.min.js b/assets/javascripts/lunr/min/lunr.tr.min.js
new file mode 100644
index 0000000..563f6ec
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.tr.min.js
@@ -0,0 +1,18 @@
+/*!
+ * Lunr languages, `Turkish` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2014, Mihai Valentin
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+/* Turkish ("tr") language plugin for lunr, minified. Called with the lunr object it defines lunr.tr (resets the index pipeline to trimmer, stop-word filter and stemmer and, when a searchPipeline exists, resets it to the stemmer only), builds the trimmer from lunr.tr.wordCharacters, builds the stemmer on lunr.stemmerSupport.Among and SnowballProgram, builds the stop-word filter from the word list at the end, and registers the three functions as trimmer-tr, stemmer-tr and stopWordFilter-tr. The stemmer wrapper accepts either a token object exposing update() or a plain string. stem() does nothing unless $() finds at least two characters of grouping Wr (presumably the vowel set; confirm against the Snowball Turkish algorithm); it then runs J() and, only if the flag nr is still true afterwards, runs R() followed by er(). */!function(r,i){"function"==typeof define&&define.amd?define(i):"object"==typeof exports?module.exports=i():i()(r.lunr)}(this,function(){return function(r){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");r.tr=function(){this.pipeline.reset(),this.pipeline.add(r.tr.trimmer,r.tr.stopWordFilter,r.tr.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(r.tr.stemmer))},r.tr.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤﬀ-ﬆＡ-Ｚａ-ｚ",r.tr.trimmer=r.trimmerSupport.generateTrimmer(r.tr.wordCharacters),r.Pipeline.registerFunction(r.tr.trimmer,"trimmer-tr"),r.tr.stemmer=function(){var i=r.stemmerSupport.Among,e=r.stemmerSupport.SnowballProgram,n=new function(){function r(r,i,e){for(;;){var n=Dr.limit-Dr.cursor;if(Dr.in_grouping_b(r,i,e)){Dr.cursor=Dr.limit-n;break}if(Dr.cursor=Dr.limit-n,Dr.cursor<=Dr.limit_backward)return!1;Dr.cursor--}return!0}function n(){var i,e;i=Dr.limit-Dr.cursor,r(Wr,97,305);for(var n=0;n<Br.length;n++){e=Dr.limit-Dr.cursor;var t=Br[n];if(Dr.eq_s_b(1,t[0])&&r(t[1],t[2],t[3]))return Dr.cursor=Dr.limit-i,!0;Dr.cursor=Dr.limit-e}return Dr.cursor=Dr.limit-e,!(!Dr.eq_s_b(1,"ü")||!r(Zr,246,252))&&(Dr.cursor=Dr.limit-i,!0)}function t(r,i){var e,n=Dr.limit-Dr.cursor;return r()&&(Dr.cursor=Dr.limit-n,Dr.cursor>Dr.limit_backward&&(Dr.cursor--,e=Dr.limit-Dr.cursor,i()))?(Dr.cursor=Dr.limit-e,!0):(Dr.cursor=Dr.limit-n,r()?(Dr.cursor=Dr.limit-n,!1):(Dr.cursor=Dr.limit-n,!(Dr.cursor<=Dr.limit_backward)&&(Dr.cursor--,!!i()&&(Dr.cursor=Dr.limit-n,!0))))}function u(r){return t(r,function(){return Dr.in_grouping_b(Wr,97,305)})}function o(){return u(function(){return Dr.eq_s_b(1,"n")})}function s(){return u(function(){return Dr.eq_s_b(1,"s")})}function c(){return u(function(){return 
Dr.eq_s_b(1,"y")})}function l(){return t(function(){return Dr.in_grouping_b(Lr,105,305)},function(){return Dr.out_grouping_b(Wr,97,305)})}function a(){return Dr.find_among_b(ur,10)&&l()}function m(){return n()&&Dr.in_grouping_b(Lr,105,305)&&s()}function d(){return Dr.find_among_b(or,2)}function f(){return n()&&Dr.in_grouping_b(Lr,105,305)&&c()}function b(){return n()&&Dr.find_among_b(sr,4)}function w(){return n()&&Dr.find_among_b(cr,4)&&o()}function _(){return n()&&Dr.find_among_b(lr,2)&&c()}function k(){return n()&&Dr.find_among_b(ar,2)}function p(){return n()&&Dr.find_among_b(mr,4)}function g(){return n()&&Dr.find_among_b(dr,2)}function y(){return n()&&Dr.find_among_b(fr,4)}function z(){return n()&&Dr.find_among_b(br,2)}function v(){return n()&&Dr.find_among_b(wr,2)&&c()}function h(){return Dr.eq_s_b(2,"ki")}function q(){return n()&&Dr.find_among_b(_r,2)&&o()}function C(){return n()&&Dr.find_among_b(kr,4)&&c()}function P(){return n()&&Dr.find_among_b(pr,4)}function F(){return n()&&Dr.find_among_b(gr,4)&&c()}function S(){return Dr.find_among_b(yr,4)}function W(){return n()&&Dr.find_among_b(zr,2)}function L(){return n()&&Dr.find_among_b(vr,4)}function x(){return n()&&Dr.find_among_b(hr,8)}function A(){return Dr.find_among_b(qr,2)}function E(){return n()&&Dr.find_among_b(Cr,32)&&c()}function j(){return Dr.find_among_b(Pr,8)&&c()}function T(){return n()&&Dr.find_among_b(Fr,4)&&c()}function Z(){return Dr.eq_s_b(3,"ken")&&c()}function B(){var r=Dr.limit-Dr.cursor;return!(T()||(Dr.cursor=Dr.limit-r,E()||(Dr.cursor=Dr.limit-r,j()||(Dr.cursor=Dr.limit-r,Z()))))}function D(){if(A()){var r=Dr.limit-Dr.cursor;if(S()||(Dr.cursor=Dr.limit-r,W()||(Dr.cursor=Dr.limit-r,C()||(Dr.cursor=Dr.limit-r,P()||(Dr.cursor=Dr.limit-r,F()||(Dr.cursor=Dr.limit-r))))),T())return!1}return!0}function G(){if(W()){Dr.bra=Dr.cursor,Dr.slice_del();var r=Dr.limit-Dr.cursor;return 
Dr.ket=Dr.cursor,x()||(Dr.cursor=Dr.limit-r,E()||(Dr.cursor=Dr.limit-r,j()||(Dr.cursor=Dr.limit-r,T()||(Dr.cursor=Dr.limit-r)))),nr=!1,!1}return!0}function H(){if(!L())return!0;var r=Dr.limit-Dr.cursor;return!E()&&(Dr.cursor=Dr.limit-r,!j())}function I(){var r,i=Dr.limit-Dr.cursor;return!(S()||(Dr.cursor=Dr.limit-i,F()||(Dr.cursor=Dr.limit-i,P()||(Dr.cursor=Dr.limit-i,C()))))||(Dr.bra=Dr.cursor,Dr.slice_del(),r=Dr.limit-Dr.cursor,Dr.ket=Dr.cursor,T()||(Dr.cursor=Dr.limit-r),!1)}function J(){var r,i=Dr.limit-Dr.cursor;if(Dr.ket=Dr.cursor,nr=!0,B()&&(Dr.cursor=Dr.limit-i,D()&&(Dr.cursor=Dr.limit-i,G()&&(Dr.cursor=Dr.limit-i,H()&&(Dr.cursor=Dr.limit-i,I()))))){if(Dr.cursor=Dr.limit-i,!x())return;Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,r=Dr.limit-Dr.cursor,S()||(Dr.cursor=Dr.limit-r,W()||(Dr.cursor=Dr.limit-r,C()||(Dr.cursor=Dr.limit-r,P()||(Dr.cursor=Dr.limit-r,F()||(Dr.cursor=Dr.limit-r))))),T()||(Dr.cursor=Dr.limit-r)}Dr.bra=Dr.cursor,Dr.slice_del()}function K(){var r,i,e,n;if(Dr.ket=Dr.cursor,h()){if(r=Dr.limit-Dr.cursor,p())return Dr.bra=Dr.cursor,Dr.slice_del(),i=Dr.limit-Dr.cursor,Dr.ket=Dr.cursor,W()?(Dr.bra=Dr.cursor,Dr.slice_del(),K()):(Dr.cursor=Dr.limit-i,a()&&(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K()))),!0;if(Dr.cursor=Dr.limit-r,w()){if(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,e=Dr.limit-Dr.cursor,d())Dr.bra=Dr.cursor,Dr.slice_del();else{if(Dr.cursor=Dr.limit-e,Dr.ket=Dr.cursor,!a()&&(Dr.cursor=Dr.limit-e,!m()&&(Dr.cursor=Dr.limit-e,!K())))return!0;Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K())}return!0}if(Dr.cursor=Dr.limit-r,g()){if(n=Dr.limit-Dr.cursor,d())Dr.bra=Dr.cursor,Dr.slice_del();else if(Dr.cursor=Dr.limit-n,m())Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K());else if(Dr.cursor=Dr.limit-n,!K())return!1;return!0}}return!1}function 
M(r){if(Dr.ket=Dr.cursor,!g()&&(Dr.cursor=Dr.limit-r,!k()))return!1;var i=Dr.limit-Dr.cursor;if(d())Dr.bra=Dr.cursor,Dr.slice_del();else if(Dr.cursor=Dr.limit-i,m())Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K());else if(Dr.cursor=Dr.limit-i,!K())return!1;return!0}function N(r){if(Dr.ket=Dr.cursor,!z()&&(Dr.cursor=Dr.limit-r,!b()))return!1;var i=Dr.limit-Dr.cursor;return!(!m()&&(Dr.cursor=Dr.limit-i,!d()))&&(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K()),!0)}function O(){var r,i=Dr.limit-Dr.cursor;return Dr.ket=Dr.cursor,!(!w()&&(Dr.cursor=Dr.limit-i,!v()))&&(Dr.bra=Dr.cursor,Dr.slice_del(),r=Dr.limit-Dr.cursor,Dr.ket=Dr.cursor,!(!W()||(Dr.bra=Dr.cursor,Dr.slice_del(),!K()))||(Dr.cursor=Dr.limit-r,Dr.ket=Dr.cursor,!(a()||(Dr.cursor=Dr.limit-r,m()||(Dr.cursor=Dr.limit-r,K())))||(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K()),!0)))}function Q(){var r,i,e=Dr.limit-Dr.cursor;if(Dr.ket=Dr.cursor,!p()&&(Dr.cursor=Dr.limit-e,!f()&&(Dr.cursor=Dr.limit-e,!_())))return!1;if(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,r=Dr.limit-Dr.cursor,a())Dr.bra=Dr.cursor,Dr.slice_del(),i=Dr.limit-Dr.cursor,Dr.ket=Dr.cursor,W()||(Dr.cursor=Dr.limit-i);else if(Dr.cursor=Dr.limit-r,!W())return!0;return Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,K(),!0}function R(){var r,i,e=Dr.limit-Dr.cursor;if(Dr.ket=Dr.cursor,W())return Dr.bra=Dr.cursor,Dr.slice_del(),void K();if(Dr.cursor=Dr.limit-e,Dr.ket=Dr.cursor,q())if(Dr.bra=Dr.cursor,Dr.slice_del(),r=Dr.limit-Dr.cursor,Dr.ket=Dr.cursor,d())Dr.bra=Dr.cursor,Dr.slice_del();else{if(Dr.cursor=Dr.limit-r,Dr.ket=Dr.cursor,!a()&&(Dr.cursor=Dr.limit-r,!m())){if(Dr.cursor=Dr.limit-r,Dr.ket=Dr.cursor,!W())return;if(Dr.bra=Dr.cursor,Dr.slice_del(),!K())return}Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K())}else 
if(Dr.cursor=Dr.limit-e,!M(e)&&(Dr.cursor=Dr.limit-e,!N(e))){if(Dr.cursor=Dr.limit-e,Dr.ket=Dr.cursor,y())return Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,i=Dr.limit-Dr.cursor,void(a()?(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K())):(Dr.cursor=Dr.limit-i,W()?(Dr.bra=Dr.cursor,Dr.slice_del(),K()):(Dr.cursor=Dr.limit-i,K())));if(Dr.cursor=Dr.limit-e,!O()){if(Dr.cursor=Dr.limit-e,d())return Dr.bra=Dr.cursor,void Dr.slice_del();Dr.cursor=Dr.limit-e,K()||(Dr.cursor=Dr.limit-e,Q()||(Dr.cursor=Dr.limit-e,Dr.ket=Dr.cursor,(a()||(Dr.cursor=Dr.limit-e,m()))&&(Dr.bra=Dr.cursor,Dr.slice_del(),Dr.ket=Dr.cursor,W()&&(Dr.bra=Dr.cursor,Dr.slice_del(),K()))))}}}function U(){var r;if(Dr.ket=Dr.cursor,r=Dr.find_among_b(Sr,4))switch(Dr.bra=Dr.cursor,r){case 1:Dr.slice_from("p");break;case 2:Dr.slice_from("ç");break;case 3:Dr.slice_from("t");break;case 4:Dr.slice_from("k")}}function V(){for(;;){var r=Dr.limit-Dr.cursor;if(Dr.in_grouping_b(Wr,97,305)){Dr.cursor=Dr.limit-r;break}if(Dr.cursor=Dr.limit-r,Dr.cursor<=Dr.limit_backward)return!1;Dr.cursor--}return!0}function X(r,i,e){if(Dr.cursor=Dr.limit-r,V()){var n=Dr.limit-Dr.cursor;if(!Dr.eq_s_b(1,i)&&(Dr.cursor=Dr.limit-n,!Dr.eq_s_b(1,e)))return!0;Dr.cursor=Dr.limit-r;var t=Dr.cursor;return Dr.insert(Dr.cursor,Dr.cursor,e),Dr.cursor=t,!1}return!0}function Y(){var r=Dr.limit-Dr.cursor;(Dr.eq_s_b(1,"d")||(Dr.cursor=Dr.limit-r,Dr.eq_s_b(1,"g")))&&X(r,"a","ı")&&X(r,"e","i")&&X(r,"o","u")&&X(r,"ö","ü")}function $(){for(var r,i=Dr.cursor,e=2;;){for(r=Dr.cursor;!Dr.in_grouping(Wr,97,305);){if(Dr.cursor>=Dr.limit)return Dr.cursor=r,!(e>0)&&(Dr.cursor=i,!0);Dr.cursor++}e--}}function rr(r,i,e){for(;!Dr.eq_s(i,e);){if(Dr.cursor>=Dr.limit)return!0;Dr.cursor++}return(tr=i)!=Dr.limit||(Dr.cursor=r,!1)}function ir(){var r=Dr.cursor;return!rr(r,2,"ad")||(Dr.cursor=r,!rr(r,5,"soyad"))}function er(){var r=Dr.cursor;return!ir()&&(Dr.limit_backward=r,Dr.cursor=Dr.limit,Y(),Dr.cursor=Dr.limit,U(),!0)}var 
nr,tr,ur=[new i("m",-1,-1),new i("n",-1,-1),new i("miz",-1,-1),new i("niz",-1,-1),new i("muz",-1,-1),new i("nuz",-1,-1),new i("müz",-1,-1),new i("nüz",-1,-1),new i("mız",-1,-1),new i("nız",-1,-1)],or=[new i("leri",-1,-1),new i("ları",-1,-1)],sr=[new i("ni",-1,-1),new i("nu",-1,-1),new i("nü",-1,-1),new i("nı",-1,-1)],cr=[new i("in",-1,-1),new i("un",-1,-1),new i("ün",-1,-1),new i("ın",-1,-1)],lr=[new i("a",-1,-1),new i("e",-1,-1)],ar=[new i("na",-1,-1),new i("ne",-1,-1)],mr=[new i("da",-1,-1),new i("ta",-1,-1),new i("de",-1,-1),new i("te",-1,-1)],dr=[new i("nda",-1,-1),new i("nde",-1,-1)],fr=[new i("dan",-1,-1),new i("tan",-1,-1),new i("den",-1,-1),new i("ten",-1,-1)],br=[new i("ndan",-1,-1),new i("nden",-1,-1)],wr=[new i("la",-1,-1),new i("le",-1,-1)],_r=[new i("ca",-1,-1),new i("ce",-1,-1)],kr=[new i("im",-1,-1),new i("um",-1,-1),new i("üm",-1,-1),new i("ım",-1,-1)],pr=[new i("sin",-1,-1),new i("sun",-1,-1),new i("sün",-1,-1),new i("sın",-1,-1)],gr=[new i("iz",-1,-1),new i("uz",-1,-1),new i("üz",-1,-1),new i("ız",-1,-1)],yr=[new i("siniz",-1,-1),new i("sunuz",-1,-1),new i("sünüz",-1,-1),new i("sınız",-1,-1)],zr=[new i("lar",-1,-1),new i("ler",-1,-1)],vr=[new i("niz",-1,-1),new i("nuz",-1,-1),new i("nüz",-1,-1),new i("nız",-1,-1)],hr=[new i("dir",-1,-1),new i("tir",-1,-1),new i("dur",-1,-1),new i("tur",-1,-1),new i("dür",-1,-1),new i("tür",-1,-1),new i("dır",-1,-1),new i("tır",-1,-1)],qr=[new i("casına",-1,-1),new i("cesine",-1,-1)],Cr=[new i("di",-1,-1),new i("ti",-1,-1),new i("dik",-1,-1),new i("tik",-1,-1),new i("duk",-1,-1),new i("tuk",-1,-1),new i("dük",-1,-1),new i("tük",-1,-1),new i("dık",-1,-1),new i("tık",-1,-1),new i("dim",-1,-1),new i("tim",-1,-1),new i("dum",-1,-1),new i("tum",-1,-1),new i("düm",-1,-1),new i("tüm",-1,-1),new i("dım",-1,-1),new i("tım",-1,-1),new i("din",-1,-1),new i("tin",-1,-1),new i("dun",-1,-1),new i("tun",-1,-1),new i("dün",-1,-1),new i("tün",-1,-1),new i("dın",-1,-1),new i("tın",-1,-1),new i("du",-1,-1),new i("tu",-1,-1),new 
i("dü",-1,-1),new i("tü",-1,-1),new i("dı",-1,-1),new i("tı",-1,-1)],Pr=[new i("sa",-1,-1),new i("se",-1,-1),new i("sak",-1,-1),new i("sek",-1,-1),new i("sam",-1,-1),new i("sem",-1,-1),new i("san",-1,-1),new i("sen",-1,-1)],Fr=[new i("miş",-1,-1),new i("muş",-1,-1),new i("müş",-1,-1),new i("mış",-1,-1)],Sr=[new i("b",-1,1),new i("c",-1,2),new i("d",-1,3),new i("ğ",-1,4)],Wr=[17,65,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,8,0,0,0,0,0,0,1],Lr=[1,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,1],xr=[1,64,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1],Ar=[17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130],Er=[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1],jr=[17],Tr=[65],Zr=[65],Br=[["a",xr,97,305],["e",Ar,101,252],["ı",Er,97,305],["i",jr,101,105],["o",Tr,111,117],["ö",Zr,246,252],["u",Tr,111,117]],Dr=new e;this.setCurrent=function(r){Dr.setCurrent(r)},this.getCurrent=function(){return Dr.getCurrent()},this.stem=function(){return!!($()&&(Dr.limit_backward=Dr.cursor,Dr.cursor=Dr.limit,J(),Dr.cursor=Dr.limit,nr&&(R(),Dr.cursor=Dr.limit_backward,er())))}};return function(r){return"function"==typeof r.update?r.update(function(r){return n.setCurrent(r),n.stem(),n.getCurrent()}):(n.setCurrent(r),n.stem(),n.getCurrent())}}(),r.Pipeline.registerFunction(r.tr.stemmer,"stemmer-tr"),r.tr.stopWordFilter=r.generateStopWordFilter("acaba altmış altı ama ancak arada aslında ayrıca bana bazı belki ben benden beni benim beri beş bile bin bir biri birkaç birkez birçok birşey birşeyi biz bizden bize bizi bizim bu buna bunda bundan bunlar bunları bunların bunu bunun burada böyle böylece da daha dahi de defa değil diye diğer doksan dokuz dolayı dolayısıyla dört edecek eden ederek edilecek ediliyor edilmesi ediyor elli en etmesi etti ettiği ettiğini eğer gibi göre halen hangi hatta hem henüz hep hepsi her herhangi herkesin hiç hiçbir iki ile ilgili ise itibaren itibariyle için işte kadar karşın katrilyon kendi kendilerine kendini kendisi kendisine kendisini kez ki kim kimden kime kimi kimse 
kırk milyar milyon mu mü mı nasıl ne neden nedenle nerde nerede nereye niye niçin o olan olarak oldu olduklarını olduğu olduğunu olmadı olmadığı olmak olması olmayan olmaz olsa olsun olup olur olursa oluyor on ona ondan onlar onlardan onları onların onu onun otuz oysa pek rağmen sadece sanki sekiz seksen sen senden seni senin siz sizden sizi sizin tarafından trilyon tüm var vardı ve veya ya yani yapacak yapmak yaptı yaptıkları yaptığı yaptığını yapılan yapılması yapıyor yedi yerine yetmiş yine yirmi yoksa yüz zaten çok çünkü öyle üzere üç şey şeyden şeyi şeyler şu şuna şunda şundan şunları şunu şöyle".split(" ")),r.Pipeline.registerFunction(r.tr.stopWordFilter,"stopWordFilter-tr")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.vi.min.js b/assets/javascripts/lunr/min/lunr.vi.min.js
new file mode 100644
index 0000000..22aed28
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.vi.min.js
@@ -0,0 +1 @@
+!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.vi=function(){this.pipeline.reset(),this.pipeline.add(e.vi.stopWordFilter,e.vi.trimmer)},e.vi.wordCharacters="[A-Za-ẓ̀͐́͑̉̃̓ÂâÊêÔôĂ-ăĐ-đƠ-ơƯ-ư]",e.vi.trimmer=e.trimmerSupport.generateTrimmer(e.vi.wordCharacters),e.Pipeline.registerFunction(e.vi.trimmer,"trimmer-vi"),e.vi.stopWordFilter=e.generateStopWordFilter("là cái nhưng mà".split(" "))}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/min/lunr.zh.min.js b/assets/javascripts/lunr/min/lunr.zh.min.js
new file mode 100644
index 0000000..fda66e9
--- /dev/null
+++ b/assets/javascripts/lunr/min/lunr.zh.min.js
@@ -0,0 +1 @@
+/* lunr-languages Chinese ("zh") plugin, minified. Under CommonJS the factory receives the @node-rs/jieba module; NOTE(review): on the AMD and browser-global paths the factory is invoked without it, so the string tokenizer path looks unusable there - confirm. Called with the lunr object (and an optional dictionary argument handed to jieba.load), it installs lunr.zh, which resets the pipeline to trimmer, stop-word filter and stemmer and swaps in the Chinese tokenizer, and registers trimmer-zh, stemmer-zh (pass-through) and stopWordFilter-zh. The tokenizer returns [] for missing or null input, lower-cases array input element by element (wrapping in lunr.Token when lunr.version starts with "2"), and otherwise cuts the trimmed, lower-cased text with jieba.cut(text, true), splits the pieces on spaces and drops empty ones; on lunr 2 each token carries position [start, length] and index metadata. The search offset u now advances past each located token, so adjacent identical or overlapping tokens no longer report the position of an earlier occurrence; u is left unchanged when a token is not found. */!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r(require("@node-rs/jieba")):r()(e.lunr)}(this,function(e){return function(r,t){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var i="2"==r.version[0];r.zh=function(){this.pipeline.reset(),this.pipeline.add(r.zh.trimmer,r.zh.stopWordFilter,r.zh.stemmer),i?this.tokenizer=r.zh.tokenizer:(r.tokenizer&&(r.tokenizer=r.zh.tokenizer),this.tokenizerFn&&(this.tokenizerFn=r.zh.tokenizer))},r.zh.tokenizer=function(n){if(!arguments.length||null==n||void 0==n)return[];if(Array.isArray(n))return n.map(function(e){return i?new r.Token(e.toLowerCase()):e.toLowerCase()});t&&e.load(t);var o=n.toString().trim().toLowerCase(),s=[];e.cut(o,!0).forEach(function(e){s=s.concat(e.split(" "))}),s=s.filter(function(e){return!!e});var u=0;return s.map(function(e,t){if(i){var n=o.indexOf(e,u),s={};return s.position=[n,e.length],s.index=t,n<0||(u=n+e.length),new r.Token(e,s)}return e})},r.zh.wordCharacters="\\w一-龥",r.zh.trimmer=r.trimmerSupport.generateTrimmer(r.zh.wordCharacters),r.Pipeline.registerFunction(r.zh.trimmer,"trimmer-zh"),r.zh.stemmer=function(){return function(e){return e}}(),r.Pipeline.registerFunction(r.zh.stemmer,"stemmer-zh"),r.zh.stopWordFilter=r.generateStopWordFilter("的 一 不 在 人 有 是 为 為 以 于 於 上 他 而 后 後 之 来 來 及 了 因 下 可 到 由 这 這 与 與 也 此 但 并 並 个 個 其 已 无 無 小 我 们 們 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 當 从 從 得 打 凡 儿 兒 尔 爾 该 該 各 给 給 跟 和 何 还 還 即 几 幾 既 看 据 據 距 靠 啦 另 么 麽 每 嘛 拿 哪 您 凭 憑 且 却 卻 让 讓 仍 啥 如 若 使 谁 誰 虽 雖 随 隨 同 所 她 哇 嗡 往 些 向 沿 哟 喲 用 咱 则 則 怎 曾 至 致 着 著 诸 諸 自".split(" ")),r.Pipeline.registerFunction(r.zh.stopWordFilter,"stopWordFilter-zh")}});
\ No newline at end of file
diff --git a/assets/javascripts/lunr/tinyseg.js b/assets/javascripts/lunr/tinyseg.js
new file mode 100644
index 0000000..167fa6d
--- /dev/null
+++ b/assets/javascripts/lunr/tinyseg.js
@@ -0,0 +1,206 @@
+/**
+ * export the module via AMD, CommonJS or as a browser global
+ * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js
+ */
+;(function (root, factory) { // UMD wrapper: hands `factory` to whichever module system is detected
+    if (typeof define === 'function' && define.amd) {
+        // AMD. Register as an anonymous module.
+        define(factory)
+    } else if (typeof exports === 'object') {
+        /**
+         * Node. Does not work with strict CommonJS, but
+         * only CommonJS-like environments that support module.exports,
+         * like Node.
+         */
+        module.exports = factory() // export the installer function returned by factory
+    } else {
+        // Browser globals (root is window)
+        factory()(root.lunr); // no module system: run the installer on the global lunr right away
+    }
+}(this, function () {
+    /**
+     * The module's export is an installer function: call it with a lunr
+     * instance and it attaches the TinySegmenter constructor to that
+     * instance as lunr.TinySegmenter. The installer returns nothing.
+     */
+
+    return function(lunr) {
+        // TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
+        // (c) 2008 Taku Kudo <taku@chasen.org>
+        // TinySegmenter is freely distributable under the terms of a new BSD licence.
+        // For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
+
+        function TinySegmenter() { // Constructor: builds the character-class patterns and the scoring weight tables
+          var patterns = { // regex character class -> one-letter class tag
+            "[一二三四五六七八九十百千万億兆]":"M",
+            "[一-龠々〆ヵヶ]":"H",
+            "[ぁ-ん]":"I",
+            "[ァ-ヴーｱ-ﾝﾞｰ]":"K",
+            "[a-zA-Zａ-ｚＡ-Ｚ]":"A",
+            "[0-9０-９]":"N"
+          }
+          this.chartype_ = [];
+          for (var i in patterns) { // compile each pattern into a [RegExp, tag] pair
+            var regexp = new RegExp(i);
+            this.chartype_.push([regexp, patterns[i]]);
+          }
+          // Weights consumed by segment(): BIAS__ is the initial score of every gap; each table below maps a feature key to an integer weight.
+          this.BIAS__ = -332
+          this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378};
+          this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920};
+          this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266};
+          this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352};
+          this.BP2__ = {"BO":60,"OO":-1762};
+          this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965};
+          this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146};
+          this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699};
+          this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973};
+          this.BW1__ = {",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"Ｂ１あ":1404,"Ｂ１同":542,"｣と":1682};
+          this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"１１":-669};
+          this.BW3__ = {"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"ｶ月":990};
+          this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832};
+          this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649};
+          this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393};
+          this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841};
+          this.TQ1__ = {"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68};
+          this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591};
+          this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685};
+          this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156};
+          this.TW1__ = {"につい":-4681,"東京都":2026};
+          this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216};
+          this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287};
+          this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865};
+          this.UC1__ = {"A":484,"K":93,"M":645,"O":-505};
+          this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646};
+          this.UC3__ = {"A":-1370,"I":2311};
+          this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646};
+          this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831};
+          this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387};
+          this.UP1__ = {"O":-214};
+          this.UP2__ = {"B":69,"O":935};
+          this.UP3__ = {"B":189};
+          this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422};
+          this.UQ2__ = {"BH":216,"BI":113,"OK":1759};
+          this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212};
+          this.UW1__ = {",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"｢":-463,"･":-135};
+          this.UW2__ = {",":-829,"、":-829,"〇":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"｢":-645,"｣":3145,"ｯ":831,"ｱ":-587,"ｶ":306,"ｷ":568};
+          this.UW3__ = {",":4889,"1":-800,"−":-1723,"、":4889,"々":-2311,"〇":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"１":-800,"｣":2670,"･":-3794,"ｯ":-1350,"ｱ":551,"ｸﾞ":1319,"ｽ":874,"ﾄ":521,"ﾑ":1109,"ﾙ":1591,"ﾛ":2201,"ﾝ":
278};
+          this.UW4__ = {",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"〇":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"｢":1895,"｣":3798,"･":-4371,"ｯ":-724,"ｰ":-11870,"ｶ":2145,"ｺ":1789,"ｾ":1287,"
ﾄ":-403,"ﾒ":-1635,"ﾗ":-881,"ﾘ":-541,"ﾙ":-856,"ﾝ":-3637};
+          this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"１":-514,"Ｅ２":-32768,"｢":363,"ｲ":241,"ﾙ":451,"ﾝ":-343};
+          this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"１":-270,"Ｅ１":306,"ﾙ":-673,"ﾝ":-496};
+          
+          return this;
+        }
+        TinySegmenter.prototype.ctype_ = function(str) {
+          for (var i in this.chartype_) {
+            if (str.match(this.chartype_[i][0])) {
+              return this.chartype_[i][1];
+            }
+          }
+          return "O";
+        }
+
+        TinySegmenter.prototype.ts_ = function(v) {
+          if (v) { return v; }
+          return 0;
+        }
+
+        TinySegmenter.prototype.segment = function(input) {
+          if (input == null || input == undefined || input == "") {
+            return [];
+          }
+          var result = [];
+          var seg = ["B3","B2","B1"];
+          var ctype = ["O","O","O"];
+          var o = input.split("");
+          for (i = 0; i < o.length; ++i) {
+            seg.push(o[i]);
+            ctype.push(this.ctype_(o[i]))
+          }
+          seg.push("E1");
+          seg.push("E2");
+          seg.push("E3");
+          ctype.push("O");
+          ctype.push("O");
+          ctype.push("O");
+          var word = seg[3];
+          var p1 = "U";
+          var p2 = "U";
+          var p3 = "U";
+          for (var i = 4; i < seg.length - 3; ++i) {
+            var score = this.BIAS__;
+            var w1 = seg[i-3];
+            var w2 = seg[i-2];
+            var w3 = seg[i-1];
+            var w4 = seg[i];
+            var w5 = seg[i+1];
+            var w6 = seg[i+2];
+            var c1 = ctype[i-3];
+            var c2 = ctype[i-2];
+            var c3 = ctype[i-1];
+            var c4 = ctype[i];
+            var c5 = ctype[i+1];
+            var c6 = ctype[i+2];
+            score += this.ts_(this.UP1__[p1]);
+            score += this.ts_(this.UP2__[p2]);
+            score += this.ts_(this.UP3__[p3]);
+            score += this.ts_(this.BP1__[p1 + p2]);
+            score += this.ts_(this.BP2__[p2 + p3]);
+            score += this.ts_(this.UW1__[w1]);
+            score += this.ts_(this.UW2__[w2]);
+            score += this.ts_(this.UW3__[w3]);
+            score += this.ts_(this.UW4__[w4]);
+            score += this.ts_(this.UW5__[w5]);
+            score += this.ts_(this.UW6__[w6]);
+            score += this.ts_(this.BW1__[w2 + w3]);
+            score += this.ts_(this.BW2__[w3 + w4]);
+            score += this.ts_(this.BW3__[w4 + w5]);
+            score += this.ts_(this.TW1__[w1 + w2 + w3]);
+            score += this.ts_(this.TW2__[w2 + w3 + w4]);
+            score += this.ts_(this.TW3__[w3 + w4 + w5]);
+            score += this.ts_(this.TW4__[w4 + w5 + w6]);
+            score += this.ts_(this.UC1__[c1]);
+            score += this.ts_(this.UC2__[c2]);
+            score += this.ts_(this.UC3__[c3]);
+            score += this.ts_(this.UC4__[c4]);
+            score += this.ts_(this.UC5__[c5]);
+            score += this.ts_(this.UC6__[c6]);
+            score += this.ts_(this.BC1__[c2 + c3]);
+            score += this.ts_(this.BC2__[c3 + c4]);
+            score += this.ts_(this.BC3__[c4 + c5]);
+            score += this.ts_(this.TC1__[c1 + c2 + c3]);
+            score += this.ts_(this.TC2__[c2 + c3 + c4]);
+            score += this.ts_(this.TC3__[c3 + c4 + c5]);
+            score += this.ts_(this.TC4__[c4 + c5 + c6]);
+        //  score += this.ts_(this.TC5__[c4 + c5 + c6]);    
+            score += this.ts_(this.UQ1__[p1 + c1]);
+            score += this.ts_(this.UQ2__[p2 + c2]);
+            score += this.ts_(this.UQ3__[p3 + c3]);
+            score += this.ts_(this.BQ1__[p2 + c2 + c3]);
+            score += this.ts_(this.BQ2__[p2 + c3 + c4]);
+            score += this.ts_(this.BQ3__[p3 + c2 + c3]);
+            score += this.ts_(this.BQ4__[p3 + c3 + c4]);
+            score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]);
+            score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]);
+            score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]);
+            score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]);
+            var p = "O";
+            if (score > 0) {
+              result.push(word);
+              word = "";
+              p = "B";
+            }
+            p1 = p2;
+            p2 = p3;
+            p3 = p;
+            word += seg[i];
+          }
+          result.push(word);
+
+          return result;
+        }
+
+        lunr.TinySegmenter = TinySegmenter; // expose the constructor on the lunr object passed to this installer
+    };
+
+}));
\ No newline at end of file
diff --git a/assets/javascripts/lunr/wordcut.js b/assets/javascripts/lunr/wordcut.js
new file mode 100644
index 0000000..0d898c9
--- /dev/null
+++ b/assets/javascripts/lunr/wordcut.js
@@ -0,0 +1,6708 @@
+(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}(g.lunr || (g.lunr = {})).wordcut = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){
+var _ = require("underscore");
+
+var Acceptors = {
+  creators: null,
+  current: null,
+  tag: null,
+
+  init: function() {
+    this.creators = [];
+    this.current = [];
+    this.tag = {};
+  },
+
+  reset: function() {
+    this.current = [];
+    this.tag = {}
+  },
+
+  transit: function(ch) {
+    var self = this;
+
+    self.creators.forEach(function(creator) {
+      var acceptor = creator.createAcceptor(self.tag);
+      if (acceptor) 
+        self.current.push(acceptor);
+    });
+    
+    var _current = [];
+    self.tag = {};
+
+    for (var i = 0; i < self.current.length; i++) {
+      var _acceptor = self.current[i]
+        , acceptor = _acceptor.transit(ch);
+      
+      if (!acceptor.isError) {
+        _current.push(acceptor);
+        self.tag[acceptor.tag] = acceptor;
+      }
+    }
+    self.current = _current;
+
+  },
+
+  getFinalAcceptors: function() {    
+    return this.current.filter(function(acceptor) {
+      return acceptor.isFinal;
+    });
+  }
+};
+
+module.exports = function() {
+  var acceptors = _.clone(Acceptors);
+  acceptors.init();
+  return acceptors;
+};
+
+},{"underscore":25}],2:[function(require,module,exports){
+(function (__dirname){
+
+var LEFT = 0;
+var RIGHT = 1;
+var path = require("path");
+var glob = require("glob");
+
+var WordcutDict = {
+
+
+  init: function (dictPathFile, withDefault, words) {
+    withDefault = withDefault || false
+    var defaultDict = path.normalize(__dirname + "/..") + "/data/tdict-*.txt";
+    this.dict=[]
+    var dictPathIsDefined = dictPathFile !== undefined
+    var dictPath = (withDefault || !dictPathIsDefined) ? [defaultDict]: [];
+    var dictPathFile = dictPathFile || defaultDict
+
+    if(dictPathIsDefined){
+      if (Array.isArray(dictPathFile)) {
+        dictPath.concat.apply(dictPath, dictPathFile);
+      } else {
+        dictPath.push(dictPathFile)
+      }
+    }
+
+    this.addFiles(dictPath, false)
+
+    if(words!==undefined){
+      this.addWords(words, false)
+    }
+    this.finalizeDict();
+  },
+
+  addWords: function(words, finalize){
+    finalize = finalize===undefined || finalize;
+    this.dict.push.apply(this.dict, words)
+    if(finalize){
+      this.finalizeDict();
+    }
+  },
+
+  finalizeDict: function(){
+    this.dict = this.sortuniq(this.dict);
+  },
+
+  addFiles: function(files, finalize){
+    finalize = finalize===undefined || finalize;
+    
+    for (var i = 0; i < 1; i++) {
+      var words = "ก.ก.\nก.ก.น.\nก.ข.ค.\nก.ค.\nก.จ.\nก.ช.น.\nก.ฌ.\nก.ต.\nก.ต.ง.\nก.ต.ช.\nก.ตร.\nก.ท.\nก.น.ช.\nก.บช.\nก.บถ.\nก.ป.ส.\nก.พ.\nก.ม.\nก.ย.\nก.ร.\nก.ล.ต.\nก.ว.\nก.ศ.ว.\nก.ส.ท.\nก.ส.ธ.\nก.ส.อ.\nก.อ.\nกก.ตชด.\nกก.ตร.น.\nกก.ภ.จว.\nกก.รสช.\nกกบ.ขส.ทบ.\nกกล.รพน.\nกง.กห.\nกง.ทบ.\nกง.ทร.\nกซข.ป.\nกซม.ป.\nกทม.กรุงเทพมหานคร\nกบ.ทบ.\nกป.สป.\nกพ.ทบ.\nกพ.ทร.\nกพ.ทหาร\nกร.ทบ.\nกรป.กลาง\nกรอ.พอ.\nกศ.ด.\nกศ.บ.\nกศ.บป.\nกศ.ม.\nกษ.ด.\nกษ.บ.\nกษ.ม.\nกส.ด.\nกส.ทบ.\nกส.บ.\nกส.ม.\nกอ.ปค.\nกอ.รพน.\nกอ.รมน.\nกอ.รสต.\nข.ต.ว.\nขว.ทบ.\nขว.ทร.\nขว.ทหาร\nขส.ทบ.\nขส.ทร.\nขส.ทอ.\nค.ด.\nค.บ.\nค.พ.ศ.\nค.ม.\nค.ร.น.\nค.ร.ฟ.\nค.ร.ม.\nค.ศ.\nค.อ.ด.\nค.อ.บ.\nค.อ.ม.\nคศ.ด.\nคศ.บ.\nคศ.ม.\nง.ด.\nจ.จ.\nจ.จ.จ.\nจ.ช.\nจ.ต.\nจ.ท.\nจ.ป.ร.\nจ.ม.\nจ.ศ.\nจ.ส.ต.\nจ.ส.ท.\nจ.ส.อ.\nจ.อ.\nจ.อ.ร.\nจ.๑๘\nจก.ธน.\nจก.สน.\nช.ค.\nช.ค.บ.\nช.พ.ค.\nช.ส.\nช.ส.ค.\nฌ.ป.ค.\nฌ.ศ.ร.\nฌ.ส.อ.\nฐท.สห.\nด.ช.\nด.ญ.\nด.ต.\nด.ศ.ค.\nด.ศ.ร.\nดย.ทร.\nต.ก.\nต.ค.\nต.จ.\nต.จ.ว.\nต.ช.\nต.ต.\nต.บ.\nต.ม.\nต.ร.\nต.ศ.ร.\nต.ห.\nต.อ.\nต.อ.จ.\nตร.กม.\nตร.ซม.\nตร.ต.\nตร.ทล.\nตร.น.\nตร.ปม.\nตร.ภ.\nตร.ม.\nตร.รฟ.\nตร.ว.\nตร.ส.\nตร.สข.\nท.จ.\nท.จ.ว.\nท.ช.\nท.ญ.\nท.ด.\nท.ท.ท.\nท.ทบ.\nท.บ.\nท.พ.\nท.ม.\nท.ศ.\nทก.ด.\nทก.บ.\nทก.ม.\nทส.ปช.\nทส.รมว.กห.\nทุ.ส.นิ.ม.\nธ.ก.ส.\nธ.ค.\nธ.ญ\nธ.บ.\nน.ช.\nน.ญ.\nน.ด.\nน.ต.\nน.ท.\nน.น.\nน.บ.\nน.บ.ท.\nน.ป.ท.\nน.พ.\nน.ม.\nน.ร.\nน.ว.\nน.ศ.\nน.ส.\nน.ส.พ.\nน.ส.๓\nน.สพ.\nน.อ.\nนปพ.ภ.\nนศ.ด.\nนศ.บ.\nนศ.ม.\nบ.ก.\nบ.ข.ส.\nบ.ช.\nบ.ด.ท.\nบ.ตร.\nบ.ภ.\nบ.ม.\nบก.จร.\nบก.ตชด.\nบก.ตม.\nบก.ทล.\nบก.น.\nบก.ป.\nบก.ปค.\nบก.ปม.\nบก.ภ.เขต\nบก.รน.\nบก.รฟ.\nบก.ร้อย.ตชด.\nบก.ส.\nบกข.ป.\nบจพ.ป.\nบช.ก.\nบช.ด.\nบช.ตชด.\nบช.น.\nบช.บ.\nบช.ปส.\nบช.ภ.\nบช.ม.\nบชท.ป.\nบชน.ป.\nบชส.ป.\nบธ.ด.\nบธ.บ.\nบธ.ม.\nบนท.ป.\nบนอ.ป.\nบปช.ป.\nป.กท.\nป.กศ.\nป.กศ.สูง\nป.จ.\nป.จ.ว.\nป.ช.\nป.ธ.\nป.ป.\nป.ป.ก.\nป.ป.ช.\nป.ป.ป.\nป.ป.ร.\nป.ป.ส.\nป.พ.\nป.พ.พ.\nป.พย.\nป.ม.\nป.ม.ก.\nป.ม.ช.\nป.ม.ธ.\nป.ม.ศ.\nป.ม.อ.\nป.ร.ร.๔\nป.ร.ร.๕\nป.ร.ร.๖\nป.ล.\nป.ว.พ.\nป.วิ.อ.\nป.ส.ส.\nป.อ.\nป.อ.ร.ส.\nป.๑\nปม.วส.\nปอ.พ.\nผกก.ภ.\nผช.ผอ.\nผต.มท.\nผบ.ตร.\nผบ.ทบ.\nผบ.ทร.\nผบ.ทสส.\nผบ.ทอ.\nผบก.น.\
nผบก.ป.\nผบก.ปค.\nผบก.ปม.\nผบก.ภ.\nผบช.ก.\nผบช.ตชด.\nผบช.น.\nผบช.ภ.\nผว.กทม.\nผอ.ปจ.\nพ.ก.ง.\nพ.กศ.\nพ.ข.ต.\nพ.ค.\nพ.ค.ช.\nพ.ค.ว.\nพ.ค.ศ.\nพ.จ.ต.\nพ.จ.ท.\nพ.จ.อ.\nพ.ช.\nพ.ช.ค.\nพ.ด.\nพ.ต.\nพ.ต.ต.\nพ.ต.ท.\nพ.ต.อ.\nพ.ต.อ.พิเศษ\nพ.ท.\nพ.บ.\nพ.ป.\nพ.ภ.ม.\nพ.ม.\nพ.ม.ช.\nพ.ย.\nพ.ร.ก.\nพ.ร.ฎ.\nพ.ร.ต.\nพ.ร.ธ.\nพ.ร.บ.\nพ.ศ.\nพ.ศ.บ.\nพ.ส.ร.\nพ.ส.ล.\nพ.อ.\nพ.อ.ต.\nพ.อ.ท.\nพ.อ.พิเศษ\nพ.อ.อ.\nพณ.ด.\nพณ.บ.\nพณ.ม.\nพธ.ด.\nพธ.บ.\nพธ.ม.\nพบ.ด.\nพบ.บ.\nพบ.ม.\nพย.ด.\nพย.บ.\nพย.ม.\nพล.จ.\nพล.ต.\nพล.ต.จ.\nพล.ต.ต.\nพล.ต.ท.\nพล.ต.อ.\nพล.ท.\nพล.ปตอ.\nพล.ม.\nพล.ม.๒\nพล.ร.จ.\nพล.ร.ต.\nพล.ร.ท.\nพล.ร.อ.\nพล.อ.\nพล.อ.จ.\nพล.อ.ต.\nพล.อ.ท.\nพล.อ.อ.\nพลา.ทร.\nพศ.ด.\nพศ.บ.\nพศ.ม.\nพอ.สว.\nภ.ง.ด.\nภ.ง.ด.๙\nภ.ด.\nภ.บ.\nภ.บ.ท.๕\nภ.ป.ร.\nภ.พ.\nภ.ม.\nภ.สถ.บ.\nม.ค.\nม.จ.\nม.ป.ท.\nม.ป.ป.\nม.ป.พ.\nม.ร.ว.\nม.ศ.\nม.อ.\nม.อ.ปัตตานี\nมิ.ย.\nมี.ค.\nยศ.ทบ.\nยศ.ทร.\nยศ.ทอ.\nร.ง.\nร.ด.\nร.ต.\nร.ต.ต.\nร.ต.ท.\nร.ต.อ.\nร.ท.\nร.น.\nร.บ.\nร.พ.\nร.ฟ.ล.\nร.ย.ล.\nร.ย.ส.ท.\nร.ล.\nร.ศ.\nร.ส.พ.\nร.อ.\nรป.ม.\nรร.จปร.\nรร.จอ.\nรร.ชท.\nรร.ตท.\nรร.นร.\nรร.นรต.\nรร.นอ.\nล.ญ.\nล.ว.\nลส.ชบ.\nว.ค.\nว.ฉ.\nว.ช.\nว.ด.ป.\nว.ป.ถ.\nวท.บ.\nศ.บ.\nศ.ป.ก.\nศ.ศ.ป.\nศฝร.ภ.\nศศ.บ.\nศษ.บ.\nศส.บ.\nส.ก.\nส.ก.ศ.ท.\nส.ค.\nส.ค.1\nส.ค.ร.\nส.ค.ส.\nส.ต.\nส.ต.ต.\nส.ต.ท.\nส.ต.อ.\nส.ท.\nส.ทร.\nส.ป.ช.\nส.ป.ส.ท.\nส.ป.อ.\nส.ร.\nส.ล.น.\nส.ว.\nส.ว.ท.\nส.ว.ส.ท.\nส.ส.\nส.ส.ท.\nส.ส.ร.\nส.ห.\nส.อ.\nสถ.บ.\nสนง.สสอ.\nสพ.ญ.\nสพ.บ.\nสว.จร.\nสว.ธร.\nสว.ส.\nสว.สป.\nสว.สส.\nสว.อก.\nสส.บ.\nสุ.จิ.ปุ.ลิ.\nห.ร.ม.\nอ.ก.ค.\nอ.ก.จ.\nอ.จ.\nอ.ช.พ.\nอ.ตร.\nอ.บ.\nอ.ส.ท.\nอ.ส.ม.ท.\nอ.ส.ย.\nอ.อ.ป.\nอส.รด.\nอุ.อา.ก.ส.\nฮ.จ.\nฮ.ท.\nฮ.ฝ.\nฮ.ล.\nฮ.ศ.\nเม.ย.\n\nกรีนิช\nกลันตัน\nกัลกัตตา\nกัวลาลัมเปอร์\nกัศมีร์\nกาฐมาณฑุ\nโกลกาตา\nควิเบก\nคอนเนตทิคัต\nคาบูล\nคุชราต\nคุนหมิง\nเคนตักกี\nเคนทักกี\nเคมบริดจ์\nแคชเมียร์\nแคนซัส\nแคนเบอร์รา\nแคโรไลนา\nแคลิฟอร์เนีย\nโคเปนเฮเกน\nโคลัมโบ\nโคโลราโด\nไครสต์เชิร์ช\nไคโร\nจาการ์ตา\nจำปาศักดิ์\nเจนไน\nเจนีวา\nเจ้อเจียง\nฉงชิ่ง\nเฉิงตู\nชานตง\nชิคาโก\nเชนไน\nเชอร์โนบิล\nซัปโปโร\nซานมารีโน\nซาบาห์\nซาราเยโว\nซาราวัก\nซิดนีย์\nซีอาน\nซีแอตเทิล\nซูริก\nซูริค\
nเซเชลส์\nเซนได\nเซี่ยงไฮ้\nโซเฟีย\nโซล\nโซโลมอน\nไซ่ง่อน\nไซบีเรีย\nดัลลัส\nดาโคตา\nดานัง\nดีทรอยต์\nดูไบ\nเดนเวอร์\nเดลาแวร์\nเดียนเบียนฟู\nโดเวอร์\nโดฮา\nไดฟุกุ\nไดฟูกุ\nตรังกานู\nตริโปลี\nตูวาลู\nเตหะราน\nโตเกียว\nโตรอนโต\nทมิฬนาฑู\nทริโปลี\nทิเบต\nเทกซัส\nเท็กซัส\nเทนเนสซี\nเทลอาวีฟ\nแทสเมเนีย\nโทรอนโต\nไทเป\nธากา\nนางาซากิ\nนาริตะ\nนิวเจอร์ซีย์\nนิวเดลี\nนิวยอร์ก\nนิวยอร์ค\nนิวแฮมป์เชียร์\nเนบราสกา\nเนแบรสกา\nเนวาดา\nบรัสเซลส์\nบราซิเลีย\nบอมเบย์\nบอสตัน\nบังกาลอร์\nบังคาลอร์\nบูคาเรสต์\nบูดาเปสต์\nเบงกาซี\nเบนกาซี\nเบรุต\nเบลเกรด\nเบอร์ลิน\nแบกแดด\nปอยเปต\nปะลิส\nปะหัง\nปักกิ่ง\nปัญจาบ\nปัฏนา\nปารีส\nปีนัง\nเประ\nเปียงยาง\nพนมเปญ\nพระตะบอง\nพะโค\nพะสิม\nพาราณสี\nพิหารี\nเพนซิลวาเนีย\nเพนซิลเวเนีย\nฟรานซ์\nฟลอริดา\nฟิลาเดลเฟีย\nฟุกุชิมะ\nฟุกุชิมา\nฟุกุโอกะ\nฟูกูโอกะ\nแฟรงก์เฟิร์ต\nมอสโก\nมะนิลา\nมะละแหม่ง\nมัณฑะเลย์\nมัทราส\nมาดริด\nมิชิแกน\nมินนิโซตา\nมิยางิ\nมิลาน\nมิวนิก\nมิสซูรี\nมุมไบ\nเมน\nเมลเบิร์น\nเมาะตะมะ\nเมาะลำเลิง\nแมนจูเรีย\nแมนเชสเตอร์\nแมนฮัตตัน\nแมริแลนด์\nแมรีแลนด์\nแมสซาชูเซตส์\nยะไข่\nย่างกุ้ง\nยูทาห์\nยูนนาน\nเยรูซาเล็ม\nโยโกฮามา\nริยาด\nรีโอเดจาเนโร\nโรดไอแลนด์\nลอนดอน\nลอสแองเจลิส\nลาปาซ\nลาสเวกัส\nลิสบอน\nลุยเซียนา\nโลซาน\nโลซานน์\nวอชิงตัน\nวอร์ซอ\nวิสคอนซิน\nเวนิส\nเวลส์\nเวอร์จิเนีย\nเวอร์มอนต์\nเวียงจันทน์\nเวียนนา\nแวนคูเวอร์\nไวโอมิง\nสกอตแลนด์\nสก็อตแลนด์\nสตอกโฮล์ม\nสลังงอร์\nเสฉวน\nเสียมราฐ\nเสียมเรียบ\nหงสา\nหงสาวดี\nหนานไห่\nหลวงพระบาง\nหูเป่ย\nหูเป่ย์\nหูหนาน\nเหอเป่ย\nเหอเป่ย์\nเหอหนาน\nอชันตา\nอลาสกา\nอวันตี\nออริกอน\nออสโล\nอะแลสกา\nอัตตะปือ\nอัมมาน\nอัมสเตอร์ดัม\nอัสสัม\nอาบูดาบี\nอาร์คันซอ\nอินเดียนา\nอิบารากิ\nอิลลินอยส์\nอิสตันบูล\nอิสลามาบัด\nอุรุมชี\nอูลานบาตอร์\nเอดินบะระ\nเอเธนส์\nแอตแลนตา\nแอริโซนา\nแอลเจียร์\nโอคลาโฮมา\nโอค็อตสค์\nโอกินาวา\nโอซากา\nโอริสสา\nโอเรกอน\nโอไฮโอ\nไอดาโฮ\nไอโอวา\nฮอนโนลูลู\nฮานอย\nฮาเนดะ\nฮาราเร\nฮาวาย\nฮิโรชิมา\nฮุสตัน\nเฮลซิงกิ\n\nมกรา\nกุมภา\nมีนา\nเมษา\nพฤษภา\nมิถุนา\nกรกฎา\nสิงหา\nกันยา\nตุลา\nพฤศจิกา\nธันวา\nเอ\nบี\nซี\nดี\nอี\nเอฟ\nจี\nเอช\nไอ\nเจ\nเค\nแอล\nเอ็ม\nเอ็น\nโอ\nพี\nค
ิว\nอาร์\nเอส\nที\nยู\nวี\nดับเบิล\nดับบลิว\nเอ็กซ์\nเอ๊กซ์\nวาย\nแซด\nแอลฟา\nแอลฟ่า\nเบตา\nเบต้า\nแกมมา\nแกมม่า\nเดลตา\nเดลต้า\nโอเมกา\nโอเมก้า\nเมกะ\nกิกะ\nนาโน\nไมโคร\n\nกรรมาชน\nกรอบรูป\nกระดี๊กระด๊า\nกระบับ\nกราวนด์\nกรีน\nกรุ๊ป\nกฤษณ์\nกลาส\nก๊วน\nกษัตริยา\nกษัตริยาธิราช\nก่อนหน้า\nกะบับ\nกับดัก\nกัมมันตะ\nก๊าก\nก๋ากั่น\nกาญจน์\nกาญจนาภิเษก\nกามิกาเซ่\nการันตี\nกาหลิบ\nกิฟท์\nกิมจิ\nกีวี\nกึ๊ก\nกึ๋ย\nกุนซือ\nกุมภาพันธ์\nกู๋\nเกจิ\nเกมส์\nเกย์\nเกรด\nเกรย์\nเกสต์เฮาส์\nเก๊ะ\nเก๋ากี้\nเกิร์ล\nแกงค์\nแกรนด์\nแกสโซฮอล์\nแก๊สโซฮอล์\nโกเต็กซ์\nโกลด์\nโกะ\nโก๊ะ\nไกด์\nขั้นตอน\nเขวี้ยง\nคณาญาติ\nครัวซอง\nครัวซองต์\nคร่ำครวญ\nครีเอทีฟ\nครูเสด\nคลับ\nคลาสสิก\nคลิตอริส\nคลิป\nความหมาย\nควิก\nควีน\nคองเกรส\nคอนซูเมอร์\nคอนเซปต์\nคอนเซ็ปต์\nคอนโด\nคอนโดมิเนียม\nคอนเทนเนอร์\nคอนแทค\nคอนแท็ค\nคอนโทรล\nคอนเฟิร์ม\nคอปเตอร์\nคอมพ์\nคอมเพล็กซ์\nคอมมอนส์\nคอมเมนท์\nคอมเมนต์\nคอร์ป\nคอร์ปอเรชั่น\nคอร์รัปชัน\nคอร์รัปชั่น\nคอรัปชัน\nคอรัปชั่น\nคอร์ส\nคอลเล็กชั่น\nคอลัมน์\nคอลัมนิสต์\nคัตเอาต์\nคันคาก\nคันถธุระ\nคันธาระ\nคันยิ\nคัสตาร์ด\nคาราโอเกะ\nคีตกวี\nคีตปฏิภาณ\nคีตราชัน\nคาปูชิโน\nคามิคาเซ่\nคาเฟ่\nคาร์\nคาร์โก้\nคาราเมล\nคาแรกเตอร์\nคาแร็กเตอร์\nคาแรคเตอร์\nคาแร็คเตอร์\nคาวบอย\nคาสิโน\nคิกขุ\nคิวบิก\nคูลเลอร์\nเคบับ\nเครป\nเคลม\nเคลียร์\nเคลื่อนย้าย\nเคส\nเคอร์ฟิว\nแคชเชียร์\nแคทวอล์ค\nแคนดิเดต\nแคนตาลูป\nแคนยอน\nแคนู\nแคป\nแคมป์\nแคมปัส\nแคมเปญ\nแคร์\nแครกเกอร์\nแคร็กเกอร์\nแครอท\nแคสต์\nแคสติง\nแคสติ้ง\nโค้ก\nโค้ช\nโคโยตี\nโคโยตี้\nโครนา\nโคอะล่า\nโคอาลา\nโคอาล่า\nไคลแมกซ์\nไคลแม็กซ์\nงั้น\nง่าว\nงี้\nเง็ง\nโง่เขลา\nไง\nจตุคาม\nจ๊อกกี้\nจอหงวน\nจังโก้\nจัมโบ้\nจ๊าบ\nจารกรรม\nจารชน\nจิ๊ก\nจิ๊กโก๋\nจิ๊กซอว์\nจิตพิสัย\nจิตเภท\nจีดีพี\nจึ๊ก\nจุ๊ย\nจูน\nจูเนียร์\nเจ๊\nเจได\nเจ็ต\nเจล\nเจ๊าะแจ๊ะ\nเจี๊ยว\nแจ็กเก็ต\nแจ๊กเก็ต\nแจ็กพอต\nแจ็กพ็อต\nแจ๊กพอต\nแจ๊กพ็อต\nแจม\nแจ๊ส\nโจ๋\nฉลุย\nเฉิ่ม\nชนะเลิศ\nช็อค\nช็อต\nช็อป\nช็อปปิ้ง\nช็อปเปอร์\nชะโนด\nชัตเตอร์\nชัวร์\nชาร์จ\nชาร์ต\nชาร์ป\nชินบัญชร\nชิฟฟอน\nชีส\nชีอะห์\nเช็ก\nเช็งเม้ง\nเชฟ\nเชลียร์\nเชอร์รี่\nแชเชือน\nแช่แข็ง\nแชมป์\nแชมปิ
ยอง\nแชมเปญ\nแชมเปี้ยน\nแชมพู\nโชว์รูม\nโชห่วย\nใช้งาน\nไชน่า\nซ้อ\nซอมบี้\nซะ\nซังเต\nซันตาคลอส\nซัพพลาย\nซัพพลายเออร์\nซัมเมอร์\nซากุระ\nซาดิสต์\nซาดิสม์\nซาตาน\nซานตาคลอส\nซาฟารี\nซาบะ\nซามูไร\nซาร์\nซาร์ดีน\nซาเล้ง\nซิง\nซิ่ง\nซิงเกิล\nซิตี\nซิตี้\nซินโดรม\nซิม\nซิ้ม\nซิมโฟนี\nซิมโฟนี่\nซิลเวอร์\nซี้\nซี้ซั้ว\nซีดาน\nซีน\nซีนีเพล็กซ์\nซีเนียร์\nซีร็อกซ์\nซีรีส์\nซีเรียส\nซีอีโอ\nซื่อบื้อ\nซุนหนี่\nซุปเปอร์\nซูชิ\nซูเปอร์\nซูม\nซูโม่\nซูเอี๋ย\nซูฮก\nเซ็กซ์\nเซ็กซี่\nเซ็กส์\nเซนเซอร์\nเซ็นเซอร์\nเซนเตอร์\nเซ็นเตอร์\nเซ็นทรัล\nเซนส์\nเซ่นไหว้\nเซฟตี้\nเซรามิก\nเซลส์\nเซลส์แมน\nเซอร์\nเซอร์ไพรส์\nเซอร์วิส\nเซาท์\nเซี้ยว\nแซ็ก\nแซกโซโฟน\nแซ็กโซโฟน\nแซนด์วิช\nแซมบ้า\nแซลมอน\nแซว\nโซเชียล\nโซน\nโซนี่\nโซลาร์\nโซโล\nโซโล่\nญาณทัสสนะ\nดยุก\nดยุค\nดร็อป\nดรัมเมเยอร์\nดรามา\nดราม่า\nดอกเตอร์\nด็อกเตอร์\nดัมพ์\nดั๊มพ์\nดาวน์\nดิกชันนารี\nดิสเครดิต\nดีกรี\nดีเจ\nดีไซน์\nดีไซน์เนอร์\nดีไซเนอร์\nดีเบต\nดีพาร์ตเมนต์\nดีพาร์ตเมนท์\nดีพาร์ทเมนต์\nดีพาร์ทเมนท์\nดีมานด์\nดีล\nดีลเลอร์\nดีเลย์\nเดชานุภาพ\nเดบิต\nเดโม\nเดย์\nเด้อ\nเดอะ\nเด๊ะ\nเดี้ยง\nเดี๊ยะ\nแดนซ์\nแดนเซอร์\nแดรี่\nโดนัท\nโดมิโน\nโดรายากิ\nไดเอ็ต\nตถตา\nตนเอง\nตรวจทาน\nตรวจสอบ\nตอกย้ำ\nต๊อง\nต่อยอด\nต่อรอง\nตะหงิด\nตังค์\nตันเถียน\nตัวตน\nตัวเอง\nตาปรือ\nต้าอ่วย\nติงต๊อง\nติ๋ม\nติ่มซำ\nติว\nติวเตอร์\nตี๋\nตื้บ\nตุ๊ก\nตุ๊กตุ๊ก\nตุ๊ด\nตุ๋ย\nตู้เซฟ\nเต๊ะ\nเตี๊ยม\nแตงกวา\nแตงโม\nแต๋ว\nโต๋เต๋\nโต๊ะจีน\nไตรมาส\nถ่ายทำ\nถูกต้อง\nทงคัตสึ\nทริป\nทรู\nทอม\nท็อป\nทอร์นาโด\nทอล์ค\nทักซิโด\nทันตกรรม\nทันตแพทยศาสตร์\nทับซ้อน\nทัวร์\nทัวร์นาเมนต์\nทัวร์นาเมนท์\nทัวริสต์\nทาเลนต์\nทาวน์\nทาวน์เฮาส์\nทำงาน\nทิป\nทิพยสมบัติ\nทิวลิป\nทีรามิสุ\nทีวี\nทูน่า\nเท็กซ์\nเทค\nเทคโน\nเทคโนแครต\nเทควันโด\nเทป\nเทรด\nเทรนด์\nเทรนเนอร์\nเทรลเลอร์\nเทรลเล่อร์\nเทเลกราฟ\nเทวบัญชา\nเทวบุตร\nเทวา\nเทวาธิราช\nเทโวโรหนะ\nเทอร์โบ\nเที่ยงคืน\nเที่ยงวัน\nเทียมทาน\nแทกติค\nแทคติค\nแทงกั๊ก\nแทงโก้\nโทมาฮอก\nโทมาฮอว์ก\nโทมาฮอว์ค\nโทร\nโทรโข่ง\nไทม์\nไทยแลนด์\nไทเฮา\nธรรมา\nธรรมาภิบาล\nธัมโม\nธีม\nธุรกรรม\nธุหร่ำ\nเธค\nนพมาศ\nนรีแพทย์\nน็อก\nน็อค\nน้องใหม่\nน
อมินี\nนอร์ท\nน่ะ\nนางแบบ\nนาฏยศาลา\nนายแบบ\nนายพราน\nนินจา\nนิรันดร์\nนิว\nนิวส์\nนู้ด\nเนอะ\nเนิร์สเซอรี\nเนิร์สเซอรี่\nเนี้ยบ\nโนติส\nไนท์\nไนน์\nบรรพชน\nบร็อกโคลี\nบร็อคโคลี\nบรา\nบริกร\nบริวเวอรี่ส์\nบลอนด์\nบลูเบอร์รี\nบลูเบอร์รี่\nบ๊วย\nบอกซ์\nบ็อกซ์\nบ๊อกซ์\nบอดี้\nบอนด์\nบ๊อบ\nบอมบ์\nบ๋อย\nบอยคอต\nบอยคอตต์\nบอร์ด\nบังเกอร์\nบัตเตอร์\nบัลลาสต์\nบัส\nบาบูน\nบาร์บีคิว\nบาร์บี้\nบาลานซ์\nบิ๊ก\nบิล\nบึม\nบึ้ม\nบุญคุณ\nบุ๋น\nบุปผา\nบู๊\nบูชิโด\nบูติก\nบูติค\nบูม\nเบเกอรี่\nเบญจมบพิตร\nเบตาดีน\nเบนโตะ\nเบนโล\nเบบี้\nเบลอ\nเบอร์เกอร์\nเบอร์รี\nเบิร์ด\nเบิร์น\nแบ็กโฮ\nแบคโฮ\nแบด\nแบต\nแบนเนอร์\nแบรนด์\nแบล็ก\nแบล็ค\nไบโอ\nโบกี้\nโบตั๋น\nโบ้ย\nโบรกเกอร์\nโบรชัวร์\nโบว์\nโบว์ลิ่ง\nไบเบิล\nปฏิสัมพันธ์\nป๊อก\nปอดแหก\nป๊อป\nป๋อหลอ\nปักขคณนา\nปัจเจกชน\nปัจฉิมนิเทศ\nป๊า\nป๋า\nป่าไม้\nปาร์ตี้\nปาสกาล\nปาสคาล\nปาสเตอร์\nปิกอัพ\nปิ๊ง\nปิโตรเคมี\nปิยมิตร\nปึ้ก\nปูอัด\nเปโซ\nเป็นไง\nเปปเปอร์มินต์\nเปเปอร์\nเปราะบาง\nเป๊ะ\nเป่ายิงฉุบ\nเป่ายิ้งฉุบ\nเปียโน\nแป้ก\nแป๋ว\nแป๊ะเจี๊ยะ\nโปร\nโปรเจกต์\nโปรเจ็กต์\nโปรเจกเตอร์\nโปรเจ็กเตอร์\nโปรเจคท์\nโปรเจ็คท์\nโปรดักชั่น\nโปรดิวเซอร์\nโปรโมชั่น\nโปรโมต\nโปรโมเตอร์\nโปรโมท\nโปลิศ\nโปสเตอร์\nผลไม้\nผลักดัน\nผ้าห่ม\nผิดพลาด\nผู้นำ\nแผดเผา\nเฝอ\nพงษ์\nพริตตี้\nพรีเซนต์\nพรีเซ็นเตอร์\nพรีเมียม\nพรีเมียร์\nพฤหัส\nพล็อต\nพลาซ่า\nพลานุภาพ\nพ่อค้า\nพอเพียง\nพะเรอ\nพันธกิจ\nพันธุวิศวกรรม\nพาร์\nพาร์ตเนอร์\nพาร์ทเนอร์\nพาวเวอร์\nพาสเจอร์ไรส์\nพาสตา\nพาสต้า\nพาสปอร์ต\nพาเหรด\nพิซซ่า\nพีเรียด\nพุดดิ้ง\nพุทธภูมิ\nพุทธศตวรรษ\nพุทโธ\nพูล\nเพทนาการ\nเพนกวิน\nเพนตากอน\nเพรส\nเพรียวบาง\nเพลซ\nเพลท\nเพลย์บอย\nเพียบแปร้\nเพียว\nเพาเวอร์\nแพกเกจ\nแพ็ค\nแพตเทิร์น\nแพทเทิร์น\nแพทยสภา\nแพนงเชิญ\nแพนดา\nแพนด้า\nแพลน\nโพลล์\nโพลารอยด์\nโพสต์\nไพลิน\nฟยอร์ด\nฟรังก์\nฟรุต\nฟลอร์\nฟลุก\nฟลุค\nฟลุต\nฟลุท\nฟอยล์\nฟอร์ม\nฟันด์\nฟาวล์\nฟาสต์ฟู้ด\nฟินิกซ์\nฟิวเจอร์\nฟีด\nฟีเวอร์\nฟุตบาท\nเฟรช\nเฟรชชี่\nเฟรม\nเฟมินิสต์\nเฟส\nเฟอร์นิเจอร์\nเฟอร์รี่\nเฟิร์ม\nเฟี้ยวฟ้าว\nแฟกซ์\nแฟ็กซ์\nแฟนซี\nแฟนตาซี\nแฟ้บ\nแฟร์\nแฟรนไชส์\nแฟรี\nแฟรี่\nแฟลช\nแฟล็ต\nโฟน\nโฟม\nโฟล์ค\nไฟต์\nไฟแน
นซ์\nไฟลต์\nไฟลท์\nภควัทคีตา\nภควัมบดี\nภควัมปติ\nภคันทลาพาธ\nภววิสัย\nภารตะ\nภูมิทัศน์\nม้ง\nมวลชน\nมยุราภิรมย์\nมลภาวะ\nมหภาค\nมหาอุปราชา\nมอคคา\nมอคค่า\nมอนสเตอร์\nม็อบ\nมอบตัว\nมอยส์เจอไรเซอร์\nมอลล์\nมะกัน\nมั้ง\nมัฟฟิน\nมั้ย\nม้านั่ง\nมาเฟีย\nมาม่า\nมายองเนส\nมายาคติ\nมาร์ก\nมาร์เก็ต\nมาร์เก็ตติ้ง\nมาร์ค\nมาร์จิน\nมาร์ช\nมาร์ต\nมาร์ท\nมาราธอน\nม้าหินอ่อน\nมินต์\nมินท์\nมินิ\nมิลค์\nมิวสิค\nมิสซัง\nมิสไซล์\nมิสเตอร์\nมือถือ\nมุมมอง\nเมคอัพ\nเมจิก\nเมจิค\nเมทัล\nเมเปิล\nเมาท์\nเมี่ยงคำ\nแมกกาซีน\nแม็กกาซีน\nแมคเคอเรล\nแม่ค้า\nแมชชีน\nแมชีน\nแมนชั่น\nแมมบ้า\nแมมโบ้\nโมจิ\nโมเดล\nโมเดิร์น\nโมเต็ล\nโมโนเรล\nโมหจริต\nไมค์\nไมเกรน\nยนตรกรรม\nยอมรับ\nยะเยือก\nยังไง\nยากูซ่า\nยาวี\nยิม\nยิว\nยุวทูต\nยูโทเปีย\nยูโร\nยูวี\nเยน\nเยลลี่\nเย้ว\nเยอบีรา\nเยอบีร่า\nเยอร์บีรา\nเยอร์บีร่า\nแยมโรล\nโยเกิร์ต\nโยโย่\nรวมมิตร\nร็อค\nร็อคเก็ต\nรองรับ\nรอมฎอน\nรอยัลตี้\nระโงก\nรันเวย์\nรัม\nรากหญ้า\nราชบัณฑิตยสถาน\nราชานุญาต\nราชานุสาวรีย์\nรามเทพ\nรามาธิบดี\nรามายณะ\nราเม็ง\nราเมน\nรายชื่อ\nราสเบอร์รี\nริกเตอร์\nริคเตอร์\nรีไซเคิล\nรีดไถ\nรีทัช\nรีเทิร์น\nรีไทร์\nรีแบรนด์\nรีพอร์ท\nรีโมต\nรีโมท\nรีวิว\nรีสอร์ต\nรีสอร์ท\nรีเสิร์ช\nรุมบ้า\nรุสโซ\nรูบิก\nรูบิค\nเรซิน\nเรซิ่น\nเรดิโอ\nเรต\nเรตติ้ง\nแรงใจ\nแรงดูด\nแรงผลัก\nแรลลี\nแรลลี่\nโรดแมป\nโรเนียว\nโรแมนติก\nโรแมนติค\nโรล\nโรลออน\nไรเฟิล\nล็อกเกอร์\nลอจิสติกส์\nล็อต\nล็อบบี้\nลอร์ด\nล้มเหลว\nละติน\nละอ่อน\nลาซานญ่า\nลาติน\nลาเต้\nลานีญา\nลามะ\nลิมิต\nลิมูซีน\nลิสต์\nลีก\nลีด\nลีดเดอร์\nลีเมอร์\nลีลาวดี\nลุค\nลูกชาย\nลูกสาว\nเลกเชอร์\nเลคเชอร์\nเลดี้\nเลสเบี้ยน\nเลิฟ\nแลนด์\nแล็บ\nโลโก้\nโลชั่น\nไลท์\nไลน์\nไลฟ์\nวนาราม\nวราราม\nวโรกาส\nว้อดก้า\nวอเตอร์\nวอฟเฟิล\nว้อย\nวอร์ม\nวอร์มอัพ\nวอร์รูม\nวอล์ก\nวอล์ค\nวอลซ์\nวอลนัต\nวอลนัท\nวอลล์\nว่ะ\nวันเวย์\nวัสสา\nวาซาบิ\nวาทกรรม\nวาทะ\nวานิลลา\nวานิลา\nวาฟเฟิล\nวาริชศาสตร์\nว้าว\nวัคค์\nวัจนะ\nวาไรตี้\nวิก\nวิดีโอ\nวิทย์\nวิน\nวิป\nวิปปิ้ง\nวิภัชภาค\nวิว\nวิลล์\nวิลเลจ\nวีเจ\nวีซ่า\nวีดิทัศน์\nวีน\nวีไอพี\nวืด\nเวณิกา\nเวเฟอร์\nเวสต์\nเวอร์\nเวิร์ก\nเวิร์กช็อป\nเวิร์ค\nเวิร์ลด์\nเวิลด์\n
แวมไพร์\nไวกิ้ง\nไวเบรเตอร์\nไวอะกร้า\nไวอากร้า\nศากยบุตร\nศิรินทร์\nศิลปวัฒนธรรม\nศิลปากร\nศิวิไลซ์\nศึกษาศาสตร์\nสกรัม\nสกาย\nสกู๊ป\nสเกตช์\nสเก็ตช์\nสคริปต์\nสแควร์\nสงบสุข\nสจ๊วต\nสตรอเบอร์รี\nสตรอเบอรี\nสตรอว์เบอร์รี\nสตริง\nสต็อก\nสต๊อก\nสต็อค\nสต๊อค\nสตอรี\nสตาร์\nสตาร์ท\nสติกเกอร์\nสติ๊กเกอร์\nสตีล\nสตูดิโอ\nสเตชัน\nสเตชั่น\nสเตเดียม\nสเตนเลส\nสเต็ป\nสเตย์\nสเตริโอ\nสเตอริโอ\nสแตนดาร์ด\nสแตนเลส\nสโตน\nสโตร์\nสไตรค์\nสไตล์\nสถาปัตย์\nสไนเปอร์\nสปอต\nสป็อต\nสปอนเซอร์\nสปอร์ต\nสปา\nสปาย\nสปิริต\nสเปก\nสเปค\nสไปเดอร์\nสมณพราหมณ์\nสมาพันธ์\nสมิติเวช\nสโรชา\nสลัม\nสแล็ก\nสโลแกน\nสโลว์\nสไลด์\nสวีท\nสหรัฐ\nสหัชญาณ\nสหัสวรรษ\nสะกอม\nสะเด่า\nสะบึม\nสะบึมส์\nสะออน\nสังโฆ\nสะโหลสะเหล\nสันทนาการ\nสัมนา\nสามช่า\nสามแยก\nสารขัณฑ์\nสี่แยก\nสึนามิ\nสุนทรีย์\nสุริยยาตร\nสุริยยาตร์\nสุหนี่\nเสกสรรค์\nเสพติด\nเสือโคร่ง\nหงวน\nหน่อมแน้ม\nหมวย\nหมั่นโถว\nหม่านโถว\nหมายปอง\nหมิง\nหยวน\nหลวงตา\nหลวงปู่\nหลวงพี่\nหล่อฮังก้วย\nหลินจือ\nห่วย\nเห็นด้วย\nเหมย\nเห่ย\nเหี่ยวย่น\nแหม็บ\nแหวว\nโหงว\nโหงวเฮ้ง\nโหลน\nโหลยโท่ย\nไหง\nไหร่\nอพาร์ตเมนต์\nอพาร์ตเมนท์\nอพาร์ทเมนต์\nอพาร์ทเมนท์\nอมาตยาธิปไตย\nอยุติธรรม\nอริยสงฆ์\nอ่วม\nอวอร์ด\nออกแบบ\nออดิชั่น\nออดิทอเรียม\nออเดอร์\nออโต้\nออทิสติก\nอ่อนด้อย\nออฟ\nออยล์\nออร์แกน\nออร์แกนิก\nออร์แกนิค\nออร์เดอร์\nออรัล\nออสซี่\nอะ\nอัตลักษณ์\nอัตวิสัย\nอันเดอร์\nอันตรกิริยา\nอัลตรา\nอัลไซเมอร์\nอัลบัม\nอัลบั้ม\nอัลมอนด์\nอาข่า\nอาโนเนะ\nอาฟเตอร์\nอาร์ติสต์\nอาร์พีจี\nอาว์\nอาสวะ\nอิกัวนา\nอินดอร์\nอินดัสตรีส์\nอินเตอร์\nอิ่มแปร้\nอิมพีเรียล\nอิเล็กทริก\nอิเล็กทริค\nอิเลียด\nอิสรชน\nอิเหนา\nอิออน\nอีแต๋น\nอีโรติก\nอีเวนท์\nอีสต์\nอีสเตอร์\nอึ๊บ\nอึ้ม\nอึ๋ม\nอึมครึม\nอุด้ง\nอุตสาหการ\nอุเทน\nอุปการคุณ\nอุปทาน\nอุปนายก\nอุปนายิกา\nอุปสงค์\nอุปัทวเหตุ\nอุรังคธาตุ\nอูคูเลเล่\nอู้ฟู่\nเอ๋\nเอ็กซ์โป\nเอ็กซ์เพรส\nเอ็กโซเซต์\nเอ็กโซเซ่ต์\nเอเซีย\nเอ็นจีโอ\nเอ็นเตอร์เทน\nเอนทรานซ์\nเอ็นทรานซ์\nเอฟเฟ็กต์\nเอเยนต์\nเอลนีโญ\nเอสเปรสโซ\nเอสเพรสโซ\nเอ๋อ\nเอาต์\nเอาท์\nเอาท์ดอร์\nเอ๊าะ\nแอ็กชั่น\nแอ็คชั่น\nแอคทีฟ\nแอดมิชชั่น\nแอดมิสชัน\nแอนด์\nแอ๊บแบ๊ว\nแอปเปิล\nแอ
ปเปิ้ล\nแอปพริคอท\nแอพพริคอท\nแอพริคอต\nแอร์\nแอโรบิก\nแอโรบิค\nแอลมอนด์\nแอสเตอร์\nโอเค\nโอเปอเรเตอร์\nโอเปร่า\nโอเพ่น\nโอ้ย\nโอยัวะ\nโอรสาธิราช\nโอเลี้ยง\nโอวัลติน\nโอเวอร์\nไอซ์\nไอซียู\nไอดอล\nไอเดีย\nไอติม\nฮวงจุ้ย\nฮ่องเต้\nฮองเฮา\nฮอต\nฮ็อต\nฮอตดอก\nฮ็อตด็อก\nฮันนีมูน\nฮัม\nฮัลโลวีน\nฮัลโหล\nฮากกา\nฮาร์ด\nฮาราคีรี\nฮาลาล\nฮาโลวีน\nฮิ\nฮิต\nฮิบรู\nฮิปโป\nฮิปฮอป\nฮีโร่\nฮูลาฮูป\nฮูล่าฮูป\nเฮฟวี\nเฮฟวี่\nเฮอร์ริเคน\nเฮีย\nแฮนด์\nแฮปปี้\nแฮมเบอร์เกอร์\nโฮป\nโฮม\nโฮลดิงส์\nโฮลวีต\nโฮสเตส\nไฮกุ\nไฮแจ็ค\nไฮโซ\nไฮเทค\nไฮบริด\nไฮเปอร์\nไฮไลต์\nไฮไลท์\nไฮเวย์\nไฮสคูล\nไฮเอนด์\n\nกรีซ\nกัมพูชา\nกัวเตมาลา\nกาตาร์\nกานา\nกาบอง\nกายอานา\nกินี\nเกรนาดีนส์\nเกรเนดา\nเกาหลี\nแกมเบีย\nโกตดิวัวร์\nคองโก\nคอโมโรส\nคอสตาริกา\nคาซัคสถาน\nคิตส์\nคิริบาตี\nคิริบาส\nคิวบา\nคีร์กีซสถาน\nคูเวต\nเคนยา\nเคปเวิร์ด\nเคย์แมน\nแคนาดา\nแคเมอรูน\nโครเอเชีย\nโคลอมเบีย\nจอร์เจีย\nจอร์แดน\nจาเมกา\nจิบูตี\nจีน\nชาด\nชิลี\nเช็ก\nซามัว\nซาอุ\nซิมบับเว\nซีเรีย\nซูดาน\nซูรินาเม\nเซนต์\nเซเนกัล\nเซอร์เบีย\nเซาตูเม\nเซียร์รา\nแซมเบีย\nโซมาเลีย\nโซเวียต\nไซปรัส\nญี่ปุ่น\nดารุสซาลาม\nเดนมาร์ก\nโดมินิกัน\nโดมินิกา\nตรินิแดด\nตองกา\nติมอร์\nตุรกี\nตูนิเซีย\nเติร์กเมนิสถาน\nโตโก\nโตเบโก\nไต้หวัน\nทาจิกิสถาน\nแทนซาเนีย\nนอร์เวย์\nนามิเบีย\nนาอูรู\nนิการากัว\nนิวซีแลนด์\nเนเธอร์แลนด์\nเนปาล\nเนวิส\nไนจีเรีย\nไนเจอร์\nบราซิล\nบริติช\nบริเตน\nบรูไน\nบอตสวานา\nบอสเนีย\nบังกลาเทศ\nบังคลาเทศ\nบัลแกเรีย\nบาร์บูดา\nบาร์เบโดส\nบาห์เรน\nบาฮามาส\nบิสเซา\nบุรุนดี\nบูร์กินาฟาโซ\nเบนิน\nเบลเยียม\nเบลารุส\nเบลีซ\nเบอร์มิวดา\nโบลิเวีย\nปรินซิปี\nปากีสถาน\nปานามา\nปาปัวนิวกินี\nปารากวัย\nปาเลสไตน์\nปาเลา\nเปรู\nเปอร์เซีย\nเปอร์โตริโก\nโปรตุเกส\nโปแลนด์\nฝรั่งเศส\nพม่า\nฟิจิ\nฟินแลนด์\nฟิลิปปินส์\nเฟรนช์\nภูฏาน\nภูฐาน\nมองโกเลีย\nมอนเตเนโกร\nมอนแทนา\nมอริเชียส\nมอริเตเนีย\nมอลโดวา\nมอลตา\nมัลดีฟส์\nมาเก๊า\nมาซิโดเนีย\nมาดากัสการ์\nมาร์แชลล์\nมาลาวี\nมาลี\nมาเลเซีย\nเม็กซิโก\nเมียนมาร์\nโมซัมบิก\nโมนาโก\nโมนาโค\nโมร็อกโก\nไมโครนีเซีย\nยูกันดา\nยูโกสลาเวีย\nยูเครน\nเยเมน\nเยอรมนี\nรวันดา\nรัสเซีย\nโรมาเนีย\nลักเซมเบิร์ก\nลัตเวีย
\nลาว\nลิกเตนสไตน์\nลิทัวเนีย\nลิเบีย\nลีโอน\nลูเซีย\nเลโซโท\nเลบานอน\nเลสเต\nไลบีเรีย\nวาติกัน\nวานูอาตู\nวินเซนต์\nเวเนซุเอลา\nเวียดนาม\nศรีลังกา\nสเปน\nสโลวะเกีย\nสโลวัก\nสโลวีเนีย\nสวาซิแลนด์\nสวิตเซอร์แลนด์\nสวีเดน\nสหรัฐ\nสหราชอาณาจักร\nสิกขิม\nสิงคโปร์\nอเมริกา\nออสเตรเลีย\nออสเตรีย\nอันดอร์รา\nอัฟกานิสถาน\nอาเซอร์ไบจาน\nอาร์เจนตินา\nอาร์เมเนีย\nอาระเบีย\nอิเควทอเรียล\nอิตาลี\nอินเดีย\nอินโดนีเซีย\nอิรัก\nอิสราเอล\nอิหร่าน\nอียิปต์\nอุซเบกิสถาน\nอุรุกวัย\nเอกวาดอร์\nเอธิโอเปีย\nเอมิเรตส์\nเอริเทรีย\nเอลซัลวาดอร์\nเอสโตเนีย\nแองโกลา\nแอนติกา\nแอลจีเรีย\nแอลเบเนีย\nโอมาน\nไอซ์แลนด์\nไอร์แลนด์\nฮ่องกง\nฮอนดูรัส\nฮังการี\nเฮติ\nเฮอร์เซโกวีนา\n\nกระบี่\nกรุงเทพ\nกาญจนบุรี\nกาฬสินธุ์\nกำแพงเพชร\nขอนแก่น\nจันทบุรี\nฉะเชิงเทรา\nชลบุรี\nชัยนาท\nชัยภูมิ\nชุมพร\nเชียงราย\nเชียงใหม่\nตรัง\nตราด\nตาก\nนครนายก\nนครปฐม\nนครพนม\nนครราชสีมา\nนครศรีธรรมราช\nนครสวรรค์\nนนทบุรี\nนราธิวาส\nน่าน\nบึงกาฬ\nบุรีรัมย์\nปทุมธานี\nประจวบคีรีขันธ์\nปราจีนบุรี\nปัตตานี\nพะเยา\nพังงา\nพัทลุง\nพิจิตร\nพิษณุโลก\nเพชรบุรี\nเพชรบูรณ์\nแพร่\nภูเก็ต\nมหาสารคาม\nมุกดาหาร\nแม่ฮ่องสอน\nยโสธร\nยะลา\nร้อยเอ็ด\nระนอง\nระยอง\nราชบุรี\nลพบุรี\nลำปาง\nลำพูน\nเลย\nศรีสะเกษ\nสกลนคร\nสงขลา\nสตูล\nสมุทรปราการ\nสมุทรสงคราม\nสมุทรสาคร\nสระแก้ว\nสระบุรี\nสิงห์บุรี\nสุโขทัย\nสุพรรณบุรี\nสุราษฎร์\nสุราษฎร์ธานี\nสุรินทร์\nหนองคาย\nหนองบัวลำภู\nอยุธยา\nอ่างทอง\nอำนาจเจริญ\nอุดรธานี\nอุตรดิตถ์\nอุทัยธานี\nอุบลราชธานี\nกันทรลักษ์\nจตุจักร\nไชยา\nซีคอน\nดอนเมือง\nถลาง\nไทรโยค\nธนบุรี\nธัญบุรี\nบางกอก\nบางปะกง\nบางระจัน\nปะทิว\nปาย\nพญาไท\nพัฒน์พงษ์\nพัทยา\nพารากอน\nภูมิซรอล\nรัตนาธิเบศร์\nรังสิต\nลันตา\nลาดพร้าว\nวโรรส\nวิภาวดี\nสตึก\nสมุย\nสัตหีบ\nสิมิลัน\nสุขุมวิท\nสุไหง\nเสลภูมิ\nอังรีดูนังต์\nอ่างขาง\nอินทนนท์\nเอ็มโพเรียม\n\nคิวชู\nแคริบเบียน\nแคสเปียน\nดานูบ\nตะนาวศรี\nนอร์วีเจียน\nนิโคบาร์\nเนรัญชรา\nไนล์\nบอร์เนียว\nบอลติก\nเบงกอล\nปิง\nแปซิฟิก\nมะละกา\nมินดาเนา\nมิสซิสซิปปี\nเมดิเตอร์เรเนียน\nเมโสโปเตเมีย\nยมุนา\nยุโรป\nยูเรเชีย\nยูเรเซีย\nแยงซี\nแยงซีเกียง\nสแกนดิเนเวีย\nสะโตง\nสาละวิน\nสุมาตรา\nสุเอซ\nอะเมซ
อน\nอันดามัน\nอัลไต\nอาร์กติก\nอาหรับ\nอินโดจีน\nอิรวดี\nอิระวดี\nอีเจียน\nอุษาคเณย์\nอูราล\nเอเชีย\nเอเดรียติก\nเอเวอเรสต์\nแอตแลนติก\nแอนตาร์กติก\nแอนตาร์กติกา\nแอฟริกา\nโอลิมปัส\nไอโอเนียน\nฮวงโห\nฮอกไกโด\nฮอนชู\n\nกบิลพัสดุ์\nกุสินารา\nโกลิยะ\nโกสัมพี\nโคตรบอง\nโคตรบูรณ์\nตองอู\nทรอย\nทวารวดี\nทวาราวดี\nเทวทหะ\nไทรบุรี\nนาลันทา\nไบแซนไทน์\nปรัสเซีย\nปัลลวะ\nปาฏลีบุตร\nพุทธคยา\nมถุรา\nมายัน\nมิถิลา\nราชคฤห์\nล้านช้าง\nล้านนา\nลุมพินี\nวัชชี\nเวสาลี\nสารนาถ\nสาวัตถี\nหริภุญชัย\nอโยธยา\nออตโตมัน\nอังวะ\nอินทปัตถ์\nอุชเชนี\n\nกราฟิก\nกราฟิกส์\nกราฟิค\nกริด\nกิกะไบต์\nกูเกิล\nกูเกิ้ล\nเกตเวย์\nโกลบอล\nคลัสเตอร์\nคลาส\nคลิก\nคลิปอาร์ต\nคอนโซล\nคอนเทนต์\nคอมพิวติ้ง\nคอมไพล์\nคอมไพเลอร์\nคอมมูนิเคชั่น\nคอร์\nคีย์\nคีย์บอร์ด\nเครือข่าย\nเคอร์เซอร์\nเคอร์เนล\nแคช\nแคมฟรอก\nแคมฟร็อก\nแคร็ก\nโค้ด\nจาวา\nจีพีเอส\nชิป\nชิพ\nเชลล์\nแช็ต\nแชนเนล\nแชนแนล\nซ็อกเก็ต\nซอฟต์แวร์\nซอฟท์แวร์\nซอร์ส\nซัพพอร์ต\nซัพพอร์ท\nซีดี\nซีดีรอม\nซีเนอร์\nเซิร์ฟเวอร์\nโซลูชัน\nโซลูชั่น\nไซต์\nไซเบอร์\nทรานแซกชัน\nทรานแซกชั่น\nทรานแซ็กชัน\nทรานแซ็กชั่น\nทรานแซคชัน\nทรานแซคชั่น\nทรานแซ็คชัน\nทรานแซ็คชั่น\nทวิตเตอร์\nทวีต\nทัชแพด\nเทมเพลต\nเทอร์มินัล\nแท็ก\nแท็บ\nแทบเล็ต\nโทรจัน\nเน็ต\nเน็ตบุ๊ก\nเน็ตบุค\nเน็ตบุ๊ค\nเน็ตเวิร์ก\nเน็ตเวิร์ค\nโน้ตบุ๊ก\nโน้ตบุค\nโน้ตบุ๊ค\nดอส\nดาวน์เกรด\nดาวน์โหลด\nดิจิตอล\nดิจิทัล\nดีบั๊ก\nดีวีดี\nดีไวซ์\nเดเบียน\nเดลไฟ\nเดสก์ท็อป\nโดเมน\nไดรว์\nไดรเวอร์\nไดเรกทอรี\nไดโอด\nเทเลคอม\nบล็อกเกอร์\nบรอดแบนด์\nบราวเซอร์\nบลูทูท\nบลูทูธ\nบลูเรย์\nบั๊ก\nบัฟเฟอร์\nบิต\nบิท\nบูต\nเบราว์เซอร์\nแบนด์วิดท์\nไบต์\nไบนารี\nโปรแกรมเมอร์\nโปรเซส\nโปรเซสเซอร์\nโปรโตคอล\nพร็อกซี\nพอร์ต\nพอร์ท\nพาเนล\nพาร์ทิชัน\nพารามิเตอร์\nพาสเวิร์ด\nพิกเซล\nเพจ\nเพจเจอร์\nแพกเก็ต\nแพตช์\nแพลตฟอร์ม\nโพรเซส\nโพรเซสเซอร์\nโพรโทคอล\nไพธอน\nฟล็อปส์\nฟอนต์\nฟอร์แมต\nฟอร์เวิร์ด\nฟอรัม\nฟีเจอร์\nเฟซบุ๊ก\nเฟิร์มแวร์\nแฟล็ก\nโฟลเดอร์\nไฟร์ฟอกซ์\nไฟร์วอลล์\nไฟล์\nมอดูล\nมอนิเตอร์\nมัลติ\nมัลติทัช\nมัลติเพล็กซ์\nมัลแวร์\nมาสเตอร์\nมีเดีย\nเมนู\nเมมโมรี\nเมล\nเมาส์\nแมค\nโมดูล\nโมเด็ม\nโมบาย\nโมบายล์\nโมไบล์\nไมโครซอฟ
ท์\nยูนิกซ์\nยูนิโคด\nยูนิโค้ด\nริงโทน\nรีเฟรช\nรีเลย์\nเราเตอร์\nเรียลไทม์\nลิงก์\nลินุกซ์\nลีนุกซ์\nลูป\nเลเยอร์\nแล็ปท็อป\nไลเซนส์\nไลบรารี\nวิกิ\nวิกิพีเดีย\nวินโดวส์\nวินโดว์ส\nเว็บ\nเวอร์ชวล\nเวอร์ชัน\nเวอร์ชั่น\nเวิร์กสเตชัน\nเวิร์กสเตชั่น\nเวิร์คสเตชัน\nเวิร์คสเตชั่น\nเวิร์ด\nเวิร์ม\nไวแมกซ์\nสกรีน\nสแกน\nสแกนเนอร์\nสแต็ก\nสนิฟเฟอร์\nสปายแวร์\nสเปซ\nสแปม\nสมาร์ท\nสล็อต\nเสิร์ช\nโหลด\nออนไลน์\nออปติก\nออปติคอล\nออปติคัล\nออฟไลน์\nออราเคิล\nอัพเกรด\nอัพเดต\nอัพโหลด\nอัปเกรด\nอัปเดต\nอัปโหลด\nอัลกอริทึม\nอาร์กิวเมนต์\nอินเตอร์เน็ต\nอินทิเกรเตอร์\nอินเทอร์เน็ต\nอินเทอร์เฟซ\nอินเทล\nอินพุต\nอินพุท\nอีเมล\nอีเมล์\nอูบุนตู\nอูบันตู\nเอนจิน\nเอ็นจิน\nเอาต์พุต\nเอาต์พุท\nเอาท์พุต\nเอาท์พุท\nแอนะล็อก\nแอนิเมชัน\nแอนิเมชั่น\nแอปพลิเคชัน\nแอปพลิเคชั่น\nแอพพลิเคชัน\nแอพพลิเคชั่น\nแอสเซมบลี\nแอสเซมเบลอร์\nโอเพน\nไอคอน\nไอซี\nไอพอด\nไอพ็อด\nไอแพด\nไอโฟน\nฮับ\nฮาร์ดดิสก์\nฮาร์ดแวร์\nแฮกเกอร์\nแฮ็กเกอร์\nแฮนด์เฮลด์\nโฮสต์\n\nกรีก\nกัณณาฑ\nกัศมีรี\nคันจิ\nคาตาคานะ\nคุชราตี\nคุรุมุขี\nซีริลลิก\nซูลู\nเซลติก\nเซิร์บ\nตากาล็อก\nเตลุคู\nเติร์ก\nทราวิฑ\nเบงกาลี\nปัญจาบี\nพินอิน\nมลยาฬัม\nมองโกล\nมาราฐี\nมาเลย์\nเม็กซิกัน\nแมนจู\nแมนดาริน\nไมถิลี\nเยอรมัน\nรัสเซียน\nสวาฮิลี\nสวิส\nสินธี\nอูรดู\nอัสสมี\nอารบิก\nอิตาเลียน\nอุยกูร์\nแอฟริกัน\nโอริยา\nไอริช\nฮันกึล\nฮินดี\nฮิรางานะ\nฮีบรู\n\nกรีนพีซ\nกรือเซะ\nกวนอิม\nกวนอู\nกัดดาฟี\nกัลยาณวัตร\nกัสสปะ\nกาลิเลโอ\nกินเนส\nกุมภกรรณ\nกูเตนเบิร์ก\nเกตส์\nเกษมณี\nโกณฑัญญะ\nขงเบ้ง\nคริสโตเฟอร์\nคองคอร์ด\nคอลเกต\nคานธี\nคาเบรียล\nคาร์ฟูร์\nคาร์สัน\nคาราบาว\nคาสิโอ\nคิริน\nคุนลุ้น\nคูโบต้า\nเครมลิน\nแคทรีนา\nโคตมะ\nโคตมี\nโคลัมบัส\nไคฟง\nไครสเลอร์\nง้อไบ๊\nจตุพร\nจ็อบส์\nจอห์น\nจิ้น\nจิม\nจิ๋ม\nจิว\nจุฬาภรณ์\nจุฬาลงกรณ์\nเจมส์\nแจ็กสัน\nโจเซฟ\nโจว\nชมัยมรุเชฐ\nชมัยมรุเชษฐ์\nชเวดากอง\nชาร์ลส์\nชินราช\nชินวัตร\nชุนชิว\nเช็ง\nเชตวัน\nเชฟรอน\nเชฟโรเลต\nเชลซี\nไชยานุชิต\nซ่ง\nซังฮี้\nซัดดัม\nซันซิล\nซัมซุง\nซัวเจ๋ง\nซินหัว\nซีซาร์\nซีแพค\nซูซาน\nซูซูกิ\nซูบารุ\nเซ็นทารา\nเซเวน\nเซเว่น\nโซฟิเทล\nโซยุซ\nโซยูซ\nณัฐวุฒิ\nดาร์ลี่\nดาวโจนส์\nดิสนีย์\nดีแท
ค\nดูปองท์\nเดโมแครต\nเดลล์\nเดลินิวส์\nเดวิด\nแดวู\nโดนัลด์\nโดราเอมอน\nโดเรมอน\nต๋อง\nตั๊กม้อ\nตากสิน\nตาเมือน\nตาลีบัน\nตูน\nเตมีย์\nโต๋\nโตชิบา\nโตโยต้า\nถังซัมจั๋ง\nถังซำจั๋ง\nทรพา\nทราเวล\nทรูมูฟ\nทีปังกร\nเทปโก\nเทพรัตน\nเทวทัต\nเทสโก้\nโทมัส\nไททานิก\nไททานิค\nไทยรัฐ\nธีออส\nนครินทรา\nนโปเลียน\nนพดล\nนราดูร\nนเรนทร\nนอสตราดามุส\nนาซา\nนาซ่า\nนาซี\nนาโต\nนาโต้\nนาลแก\nนิคอน\nนิโคลัส\nนิด้า\nนินเทนโด\nนิปปอน\nนิวตัน\nนิสสัน\nเนคเทค\nเนชั่น\nเนชันแนล\nเนชั่นแนล\nเนวิน\nเนสเล่\nเนสาด\nแนท\nแนสแดค\nโนเกีย\nโนเบล\nโนเวลล์\nโนโวเทล\nไนเม็กซ์\nบรอดเวย์\nบรัดเลย์\nบรู๊ซ\nบัลเมอร์\nบารัก\nบารัค\nบู๊ตึ๊ง\nเบญกาย\nเบนซ์\nเบ็นซ์\nเบนจามิน\nโบตัน\nไบโอเทค\nประชาธิปัตย์\nปวีณา\nปอเต็กตึ๊ง\nปอเต๊กตึ๊ง\nป่อเต็กตึ๊ง\nปัตตะโชติ\nปารุสก์\nปีเตอร์\nปูติน\nเป๊ปซี่\nเป้ย\nเปอร์โยต์\nเปาบุ้นจิ้น\nโปเกมอน\nโป๊ยก่าย\nพรหมทัต\nพลาโต\nพอลล่า\nพานาโซนิค\nพานาโซนิก\nพิทยานุกูล\nพิมพิสาร\nเพนแทกซ์\nเพลโต\nไพโอเนียร์\nฟรอยด์\nฟรังซิส\nฟรานซิส\nฟลอเรนซ์\nฟอร์ด\nฟิลิปส์\nฟูจิ\nแฟซ่า\nโฟร์โมสต์\nภูมิพล\nภูริทัต\nมงฟอร์ต\nมณโฑ\nมติชน\nมหิตลาธิเบศร\nมโหสถ\nมัจฉานุ\nมาร์กาเร็ต\nมาร์ติน\nมาสด้า\nมิตซูบิชิ\nมิราเคิล\nมุสโสลินี\nเม้ง\nเมจิ\nเมอร์ซีเดส\nเมอร์เซเดส\nแมกซ์เวลล์\nแมกไซไซ\nแมคอินทอช\nแมชีนเนอรี่\nโมคคัลลานะ\nโมโตโรลา\nโมโตโรล่า\nโมเนีย\nไมเคิล\nไมยราพณ์\nยโสธรา\nยะโฮวา\nยะโฮวาห์\nยามาฮ่า\nยาเวห์\nยาฮู\nยูนิเซฟ\nยูเนสโก\nยูไล\nเยโฮวาห์\nรอยเตอร์\nรอยัล\nรัชดา\nรัสปูติน\nราฟาเอล\nรามาวตาร\nราเมศวร\nราหุล\nริชาร์ด\nรีพับลิกัน\nรูนีย์\nเรนโบว์\nแรมโบ้\nโรตารี\nโรนัลโด\nโรนัลโด้\nโรบินสัน\nโรเบิร์ต\nล็อกซเล่ย์\nลิงคอล์น\nลิจฉวี\nลิไท\nลิไทย\nลินคอล์น\nลิเวอร์พูล\nเลโนโว\nเลียดก๊ก\nโลตัส\nวชิราลงกรณ์\nวลาดิเมียร์\nวอลล์สตรีท\nวาเลนไทน์\nวิกตอเรีย\nวิทยานุสรณ์\nวิทยายน\nวิมเบิลดัน\nวิลเลียม\nวีระ\nวุฒิชัย\nเวียดกง\nไวตามิลค์\nศกุนตลา\nศรีวิชัย\nศิริพงษ์\nศิริราช\nศุภชลาศัย\nสดกก๊อกธม\nสดายุ\nสตาลิน\nสตีฟ\nสแตนฟอร์ด\nสวรินทิรา\nสังกัจจายน์\nสาทิตย์\nสารีบุตร\nสิริกิติ์\nสิรินธร\nสิหิงค์\nสีวลี\nสีหนุ\nสีหมุนี\nสีหโมนี\nสุครีพ\nสุทโธทนะ\nสุเทพ\nสุนทราภรณ์\nสุนีย์\nสุรนารี\nสุรยุทธ์\nสุริยาสน์\nเ
ส้าหลิน\nโสกราตีส\nโสภิต\nหนุมาน\nหลินฮุ่ย\nหลุยส์\nเห้งเจีย\nไหหม่า\nองคต\nองคุลิมาล\nอชาตศัตรู\nอดุลยเดช\nอพอลโล\nอริสโตเติล\nอริสมันต์\nอลิซาเบธ\nอ๋อม\nออร์คิด\nออสการ์\nอะพอลโล\nอับราฮัม\nอั้ม\nอัลกออิดะห์\nอัลคาเทล\nอัลจาซีราห์\nอัลเฟรด\nอัลเลาะห์\nอัสซุส\nอัสสชิ\nอัสสัมชัญ\nอาเซม\nอาเซ็ม\nอาเซียน\nอาฟต้า\nอาร์เซนอล\nอินทรชิต\nอินทราทิตย์\nอีซูซุ\nอีเลฟเวน\nอีเลฟเว่น\nอุณรุท\nอุบลรัตน์\nอุบาลี\nอุ๋ย\nเอกทัศน์\nเอเซอร์\nเอ็ดเวิร์ด\nเอดิสัน\nเอแบค\nเอลิซาเบธ\nเอสพลานาด\nเอสพลานาร์ด\nแอคคอร์\nแอคคอร์ด\nแองเจลิน่า\nแอตแลนติส\nแอน\nแอ๋ม\nแอมบาสซาเดอร์\nแอมบาสเดอร์\nแอมเวย์\nแอ๋ว\nโอดีสซีย์\nโอเดียน\nโอบามา\nโอรสาราม\nโอลิมเปีย\nโออิชิ\nไอน์สไตน์\nฮอนด้า\nฮอปกินส์\nฮอลลีวูด\nฮอลลีวู้ด\nฮานามิ\nฮามาส\nฮิตเลอร์\nฮิตาชิ\nฮุนเซน\nฮุนเซ็น\nฮุนได\nฮุสเซ็น\nเฮนรี\nเฮนรี่\nเฮเลน\nโฮจิมินห์\nโฮปเวลล์\nโฮเมอร์\n\nกลีเซอรีน\nกำทอน\nแกนีมีด\nครอส\nคริสตัล\nคลอโรพลาสต์\nคลอไรด์\nควอนตัม\nคอนดักเตอร์\nคอปเปอร์\nคอลลาเจน\nคอเลสเตอรอล\nคอสมิก\nคูลอมบ์\nเคอราติน\nแคโรทีน\nแคสสินี\nโครมาโทกราฟี\nไคโตซาน\nจีโนม\nจุลชีววิทยา\nชิคุนกุนยา\nซัลฟิวริก\nซัลเฟต\nซัลไฟด์\nซิงค์\nซิริอุส\nซิลิกา\nซิลิเกต\nซิลิโคน\nซีเทน\nซีเวิร์ต\nเซ็กเตอร์\nเซ็กเมนต์\nเซมิ\nโซนาร์\nไซบอร์ก\nดอปเปลอร์\nดอปเพลอร์\nไดนามิก\nไดนามิกส์\nไดนามิค\nไดนามิคส์\nไดออกไซด์\nทรานส์\nทามิฟลู\nเทฟลอน\nเทสโทสเตอโรน\nเทอร์โม\nแทนนิน\nไททัน\nไทฟอยด์\nไทรอยด์\nธาลัสซีเมีย\nเนกาตีฟ\nโนวา\nบอแรกซ์\nโบทอกซ์\nโบท็อกซ์\nไบโอติน\nปฏิยานุพันธ์\nโปรเจสเตอโรน\nพอลิเมอร์\nพันธุศาสตร์\nพัลซาร์\nพาร์กินสัน\nพาราเซตามอล\nพาราโบลา\nเพอร์ออกไซด์\nโพรเจสเทอโรน\nโพลาไรซ์\nโพลิเมอร์\nโพลีเอทิลีน\nไพรเมต\nฟลาโวนอยด์\nฟลูออเรสเซนซ์\nฟลูออไรด์\nฟอสซิล\nฟิชชัน\nฟิวชัน\nฟีโรโมน\nไฟเบอร์\nมอนอกไซด์\nมิราจ\nเมตริกซ์\nเมทริกซ์\nเมลานิน\nเมลามีน\nโมเมนตัม\nไมโตคอนเดรีย\nไมโทคอนเดรีย\nยีสต์\nยูริก\nยูเรีย\nรูมาตอยด์\nวีก้า\nเวกเตอร์\nเวก้า\nเวสิเคิล\nโวลต์\nสเกล\nสเกลาร์\nสเต็ม\nสเตียรอยด์\nสปีชีส์\nสเปิร์ม\nสัมพัทธภาพ\nสุริยจักรวาล\nออกเทน\nออโรรา\nออโรร่า\nอะคริลิก\nอะครีลิก\nอะซีติก\nอะซีโตน\nอะมิโน\nอะลูมินา\nอันโดรเมดา\nอัลคาไลน์\nอัลตราซาวด์\nอัลตราซ
าวนด์\nอัลลอยด์\nอินทิกรัล\nอินทิเกรต\nอีโบลา\nอีโบล่า\nเอ็กซ์โพเนนเชียล\nเอทานอล\nเอทิลีน\nเอนโทรปี\nเอสเตอร์\nเอสโตรเจน\nเอสโทรเจน\nแอนดรอยด์\nแอนแทร็กซ์\nแอมพลิจูด\nแอมโมเนียม\nแอลกอฮอลิซึม\nแอสพาร์แตม\nโอเซลทามิเวียร์\nฮับเบิล\nฮิวมัส\nไฮดรอกไซด์\nไฮดรอลิก\nไฮโดรลิก\nไฮเพอร์โบลา\n\nกงเต็ก\nกฎุมพี\nกฏ\nกฏหมาย\nกบฎ\nกราไฟต์\nก๊อปปี้\nกะทะ\nกังวาล\nกุฎฐัง\nกุฏุมพี\nฃวด\nคฑา\nคลินิค\nคลีนิก\nคลีนิค\nคาทอลิค\nคึ่นฉ่าย\nแคตตาล็อก\nโควต้า\nฅน\nจุมพฎ\nช็อคโกแลต\nแซ่ด\nดัทช์\nทนง\nทรมาณ\nทราก\nทะแยง\nทิฏฐิ\nบล็อค\nบ๊องแบ๊ว\nบัลเล่ต์\nเบรค\nแบงค์\nปรากฎ\nปัคคหะ\nปาฏิโมกข์\nปิติ\nปิรามิด\nโปรเตสแตนท์\nพนิช\nพยักเพยิด\nพริ้ว\nพลูโตเนียม\nพากษ์\nเฟิร์น\nยากี้\nเยภุยยสิกา\nรุสเซีย\nฤาษี\nล็อค\nล็อคเกอร์\nวันทยาหัตถ์\nวานิช\nวิญญาน\nวิศิษฐ์\nศรีษะ\nสเปกโทรสโคป\nสฤษฎ์\nสลิ่ม\nสัตตสดก\nสาราณียากร\nสุกี้\nสุกี้ยากี้\nสูญญากาศ\nหยอมแหยม\nหยอย\nหล่ะ\nอะดรีนาลีน\nอะหลั่ย\nอัตคัต\nอัฟริกา\nอานิสงค์\nอาฟริกา\nอิริยาบท\nอิเลคโทรนิคส์\nอีรุงตุงนัง\nอุตรายัน\nอุลตรา\nอุลตร้า\nโอกาศ\n\nกกหู\nกงการ\nกงจักร\nกฎเกณฑ์\nกฎบัตร\nกฎหมาย\nกฎหมู่\nกดขี่\nกดดัน\nก้นกบ\nก้นบึ้ง\nก้นปล่อง\nกนิษฐภคินี\nกนิษฐภาดา\nกรงเล็บ\nกรมการ\nกรมท่า\nกรมธรรม์\nกรมนา\nกรมวัง\nกรรมกร\nกรรมการ\nกรรมฐาน\nกรรมบถ\nกรรมพันธุ์\nกรรมวิธี\nกรรมสิทธิ์\nกระจกเงา\nกระจกตา\nกระจกนูน\nกระจกเว้า\nกระจอกชวา\nกระจอกเทศ\nกระจ้อยร่อย\nกระจับบก\nกระจับปิ้ง\nกระจับปี่\nกระโชกโฮกฮาก\nกระดานดำ\nกระดานหก\nกระดาษแก้ว\nกระดาษไข\nกระดาษทราย\nกระดาษสา\nกระดูกงู\nกระทาชาย\nกระเท่เร่\nกระบวนการ\nกระบองเพชร\nกระผีกริ้น\nกระยาทิพย์\nกระยาสารท\nกระยาหาร\nกระสอบทราย\nกระสุนปืน\nกระแสจิต\nกระแสน้ำ\nกระแสลม\nกรับคู่\nกรับพวง\nกรับเสภา\nกราดเกรี้ยว\nกราวรูด\nกริ่งเกรง\nกรีฑาสถาน\nกรีดกราย\nกรี๊ดกร๊าด\nกลไก\nกลบท\nกลเม็ด\nกลยุทธ์\nกลวิธี\nกลศาสตร์\nกลอักษร\nกลบเกลื่อน\nกลมกล่อม\nกลมกลืน\nกลมเกลียว\nกล้วยแขก\nกล้วยไม้\nกล่องเสียง\nกล่อมเกลา\nกล่อมเกลี้ยง\nกลัดกลุ้ม\nกลัดมัน\nกลั่นกรอง\nกลั่นแกล้ง\nกลับกลอก\nกลางคน\nกลางคัน\nกลางค่ำ\nกลางคืน\nกลางแจ้ง\nกลางแปลง\nกลางวัน\nกลาดเกลื่อน\nกล่าวขวัญ\nกล่าวโทษ\nกล่าวหา\nกล้ำกราย\nกล้ำกลืน\nกลิ้งกลอก\nกลิ้งเกลือ
ก\nกลิ่นอาย\nกลียุค\nกวดขัน\nกวัดแกว่ง\nกวัดไกว\nกวีนิพนธ์\nก่อกวน\nก่อการ\nก่อตั้ง\nก่อสร้าง\nก่อหวอด\nก้อร่อก้อติก\nกองกลาง\nกองเกิน\nกองโจร\nกองทัพ\nกองทุน\nกองพล\nกองพัน\nกองฟอน\nกองร้อย\nกองหนุน\nกอบโกย\nกะเกณฑ์\nกะบังลม\nกักกัน\nกักขัง\nกักตัว\nกักตุน\nกัณฑ์เทศน์\nกัดฟัน\nกันชน\nกันท่า\nกันสาด\nกันเอง\nกับแกล้ม\nกับข้าว\nกับระเบิด\nกากเพชร\nกากหมู\nกาชาด\nกาญจนาภิเษก\nก้านคอ\nกาฝาก\nก้ามกราม\nกามกิจ\nกามคุณ\nกามเทพ\nกามโรค\nก้ามปู\nกายกรรม\nกายบริหาร\nกายภาพ\nกายวิภาค\nกายสิทธิ์\nก่ายกอง\nการคลัง\nการเงิน\nการบ้าน\nการเปรียญ\nการเมือง\nการเรือน\nการละเล่น\nกาลกิริยา\nกาลเทศะ\nก้าวก่าย\nก้าวร้าว\nก้าวหน้า\nกาสาวพัสตร์\nกาฬพฤกษ์\nกาฬโรค\nกำปั้น\nกำมือ\nกำแพงขาว\nกำลังใจ\nกำลังม้า\nกี่งอำเภอ\nกิจกรรม\nกิจการ\nกิจวัตร\nกิจจะลักษณะ\nกิตติคุณ\nกิตติศัพท์\nกินขาด\nกินใจ\nกินดอง\nกินโต๊ะ\nกินแบ่ง\nกินเปล่า\nกินเลี้ยง\nกินเส้น\nกินแหนง\nกิโลกรัม\nกิโลเมตร\nกิโลลิตร\nกิโลเฮิรตซ์\nกีดกัน\nกีดกั้น\nกีดขวาง\nกึกก้อง\nกึกกัก\nกึกกือ\nกึ่งกลาง\nกุกกัก\nกุ๊กกิ๊ก\nกุ้งฝอย\nกุ้งมังกร\nกุ้งแห้ง\nกุ้งเต้น\nกุ้งยิง\nกุญแจผี\nกุญแจมือ\nกุญแจเลื่อน\nกุญแจเสียง\nกุลธิดา\nกุลบุตร\nกุลสตรี\nกู้ยืม\nเก้งก้าง\nเก็บกวาด\nเก็บเกี่ยว\nเก็บงำ\nเก็บตก\nเกรงกลัว\nเกรงใจ\nเกรงขาม\nเกรียงไกร\nเกรียมกรม\nเกรี้ยวกราด\nเกล็ดเลือด\nเกลี้ยกล่อม\nเกลี้ยงเกลา\nเกลือกกลั้ว\nเกลือกกลิ้ง\nเกลื่อนกล่น\nเกลื่อนกลาด\nเกศธาตุ\nเกษตรกร\nเกษตรกรรม\nเกษตรศาสตร์\nเกษมสันต์\nเกษียรสมุทร\nเก้อเขิน\nเกาะแกะ\nเกี่ยงงอน\nเกียจคร้าน\nเกียรติคุณ\nเกียรตินิยม\nเกียรติประวัติ\nเกียรติภูมิ\nเกียรติยศ\nเกียรติศักดิ์\nเกียรติมุข\nเกี่ยวข้อง\nเกี่ยวดอง\nเกี่ยวพัน\nเกี่ยวโยง\nเกี้ยวพาน\nเกี้ยวพาราสี\nแก่แดด\nแก้ขัด\nแก้ไข\nแก้ตัว\nแก้เผ็ด\nแก้ลำ\nแกงคั่ว\nแกงจืด\nแกงบวด\nแกงป่า\nแกงเผ็ด\nแกงส้ม\nแก่งแย่ง\nแก่นแก้ว\nแก่นสาร\nแก้วตา\nแก้วหู\nแกว่งกวัด\nแกว่งไกว\nแกะรอย\nโก้เก๋\nโกรธเกรี้ยว\nโกรธขึ้ง\nไก่เขี่ย\nไก่ชน\nไก่บ้าน\nไก่ป่า\nไก่ฟ้า\nไกล่เกลี่ย\nขจัดขจาย\nขนเพชร\nขนสัตว์\nขนหนู\nขนส่ง\nขนมจีน\nขบขัน\nขบวนการ\nข่มขี่\nข่มขู่\nข่มขืน\nข่มเหง\nขมหิน\nขมิ้นชัน\nขมุบขมิบ\nขยะแขยง\nขยักขย่อน\nขยักขย้อน\nขยับขยาย\nขยับเขยื้อน\n
ขวดโหล\nขวยเขิน\nขวัญใจ\nขวัญตา\nขวัญอ่อน\nขวากหนาม\nขวางโลก\nของ้าว\nขอสับ\nขอขมา\nขอทาน\nขอโทษ\nขอร้อง\nขออภัย\nข้อเขียน\nข้อความ\nข้อเท็จจริง\nของกลาง\nของขวัญ\nของแข็ง\nของชำ\nของลับ\nของเล่น\nของว่าง\nของเหลว\nของไหล\nของไหว้\nข้องใจ\nข้องแวะ\nขอบข่าย\nขอบเขต\nขอบคุณ\nขอบใจ\nขอบพระคุณ\nข้อมูล\nข้อแม้\nข้อหา\nข้อสังเกต\nขัดข้อง\nขัดขืน\nขัดเขิน\nขัดจังหวะ\nขัดดอก\nขัดแตะ\nขัดยอก\nขัดแย้ง\nขัดสมาธิ\nขันหมาก\nขันอาสา\nขับขี่\nขับเคี่ยว\nขั้วโลก\nขาจร\nขาประจำ\nข้าราชการ\nข้าศึก\nข้าหลวง\nขาดแคลน\nขาดใจ\nขาดตอน\nขาดตัว\nขาดทุน\nขาดเหลือ\nขายหน้า\nข่าวกรอง\nข่าวคราว\nข่าวล่า\nข่าวลือ\nข่าวสาร\nข้าวเกรียบ\nข้าวแกง\nข้าวของ\nข้าวจี่\nข้าวเจ้า\nข้าวซอย\nข้าวต้ม\nข้าวตอก\nข้าวตัง\nข้าวแตน\nข้าวทิพย์\nข้าวบิณฑ์\nข้าวเปลือก\nข้าวโพด\nข้าวฟ่าง\nข้าวเม่า\nข้าวสวย\nข้าวสาร\nข้าวเหนียว\nข้าวหมาก\nข้าวหลาม\nขี้เกียจ\nขี้ข้า\nขี้ครอก\nขี้คร้าน\nขี้คุก\nขี้ไคล\nขี้เซา\nขีดขั้น\nขีดคร่อม\nขีดคั่น\nขีดฆ่า\nขี้ตา\nขี้ตืด\nขี้เถ้า\nขี้ทูด\nขี้ปะติ๋ว\nขี้ผึ้ง\nขี้มูก\nขี้ยา\nขี้แย\nขี้ริ้ว\nขี้เรื้อน\nขี้เล็บ\nขี้หู\nขี้หน้า\nขี้เหนียว\nขี้เหล็ก\nขี้เหร่\nขึงขัง\nขึงพืด\nขึ้งโกรธ\nขึ้นใจ\nขึ้นชื่อ\nขึ้นมือ\nขืนใจ\nขื่นขม\nขุดคุ้ย\nขุนทอง\nขุนนาง\nขุนพล\nขุนศึก\nขุนหลวง\nขูดรีด\nเข็มกลัด\nเข็มขัด\nเข้มข้น\nเข้มแข็ง\nเข้มงวด\nเข็มทิศ\nเข็มหมุด\nเข้าขา\nเข้าเค้า\nเข้าใจ\nเข้าชื่อ\nเข้าตัว\nเข้าถึง\nเข้าทรง\nเข้าท่า\nเข้าที\nเข้าเนื้อ\nเข้าเล่ม\nเขียวเสวย\nเขียวหวาน\nแขกเต้า\nแข็งกร้าว\nแข็งกล้า\nแข็งแกร่ง\nแข็งข้อ\nแข็งขัน\nแข่งขัน\nแข็งใจ\nแข็งตัว\nแข็งเมือง\nแข็งแรง\nแขวนลอย\nโขกสับ\nโขยกเขยก\nไขข้อ\nไขควง\nไขมัน\nไข่มุก\nไขว่ห้าง\nไขสันหลัง\nไขสือ\nไข่เค็ม\nไข่เยี่ยวม้า\nไข่หงส์\nไข่เหี้ย\nคงกระพัน\nคงตัว\nคงทน\nคงที่\nคชลักษณ์\nคชสาร\nคชสีห์\nคชราช\nคณิตศาสตร์\nคดเคี้ยว\nคติธรรม\nคติพจน์\nคนกลาง\nคนไข้\nคนใช้\nคนทรง\nคบไฟ\nคบเพลิง\nคบค้า\nคบคิด\nคบหา\nคมคาย\nครบครัน\nครบถ้วน\nครอบครอง\nครอบคลุม\nครอบครัว\nครอบงำ\nครอบจักรวาล\nคริสตกาล\nคริสตจักร\nคริสต์มาส\nคริสต์ศตวรรษ\nคริสต์ศักราช\nคริสตัง\nคริสเตียน\nครุกรรม\nครุภัณฑ์\nครุศาสตร์\nครุฑพ่าห์\nครุ่นคิด\nคลอเคลีย\nคล่องแคล่ว\nคล่องต
ัว\nคล่องมือ\nคลั่งไคล้\nคลาคล่ำ\nคลาไคล\nคลาดเคลื่อน\nคลาดแคล้ว\nคลี่คลาย\nคลึงเคล้น\nคลึงเคล้า\nคลื่นไส้\nคลื่นเหียน\nคลุกคลี\nคลุกคลาน\nคลุมเครือ\nคลุมโปง\nคลุ้มคลั่ง\nควงสว่าน\nควบคุม\nควบคู่\nควบแน่น\nควันหลง\nความคิด\nความหลัง\nความเห็น\nคอหอย\nคอห่าน\nคอแห้ง\nค่อนขอด\nค่อนแคะ\nค้อนควัก\nคั่งค้าง\nคั่งแค้น\nคัดค้าน\nคัดง้าง\nคัดท้าย\nคัดเลือก\nคันจาม\nคันฉ่อง\nคันฉาย\nคันชัก\nคันชั่ง\nคันไถ\nคันนา\nคันเร่ง\nคับขัน\nคับคั่ง\nคับแค้น\nคับแคบ\nคางทูม\nคางหมู\nค้างคืน\nค้างปี\nคาดคั้น\nคาดเชือก\nคาดโทษ\nคาดหมาย\nคานหาม\nคาบเกี่ยว\nคาบศิลา\nคาบสมุทร\nคำขาด\nคำนำ\nคิดค้น\nคืนดี\nคืนตัว\nคุกเข่า\nคุณค่า\nคุณชาย\nคุณธรรม\nคุณนาย\nคุณภาพ\nคุณลักษณะ\nคุณวุฒิ\nคุณศัพท์\nคุณสมบัติ\nคุณหญิง\nคุณากร\nคุณูปการ\nคุโณปการ\nคุมเชิง\nคุ้มกัน\nคุยเขื่อง\nคุยโต\nคุ้ยเขี่ย\nคุ้มครอง\nคู่กรณี\nคู่กัด\nคู่ขา\nคู่แข่ง\nคู่ครอง\nคู่ควร\nคู่คิด\nคู่คี่\nคู่ใจ\nคู่ชีพ\nคู่ชีวิต\nคู่บารมี\nคู่บุญ\nคู่ปรปักษ์\nคู่ปรับ\nคู่ผสม\nคู่มือ\nคู่รัก\nคู่ลำดับ\nคู่สาย\nคู่หมั้น\nคู่หู\nคู่อริ\nคู่อาฆาต\nเคมีภัณฑ์\nเคยตัว\nเคร่งขรึม\nเคร่งครัด\nเคร่งเครียด\nเครดิตฟองซิเอร์\nเครื่องกล\nเครื่องกัณฑ์\nเครื่องแกง\nเครื่องเขิน\nเครื่องครัว\nเครื่องเคียง\nเครื่องเงิน\nเครื่องจักร\nเครื่องเซ่น\nเครื่องดนตรี\nเครื่องต้น\nเครื่องทุ่นแรง\nเครื่องเทศ\nเครื่องใน\nเครื่องบิน\nเครื่องบูชา\nเครื่องแบบ\nเครื่องประดับ\nเครื่องปรุง\nเครื่องปรุงรส\nเครื่องมือ\nเครื่องยนต์\nเครื่องร่อน\nเครื่องราง\nเครื่องเรือน\nเครื่องล่าง\nเครื่องเล่น\nเครื่องสาย\nเครื่องสำอาง\nเครื่องสุกำศพ\nเครื่องหมาย\nเครือรัฐ\nเคลียคลอ\nเคลื่อนที่\nเคลื่อนไหว\nเคลือบแคลง\nเคลือบแฝง\nเคลือบฟัน\nเคว้งคว้าง\nเคหสถาน\nเค้าโครง\nเคียดแค้น\nเคี่ยวเข็ญ\nเคี้ยวเอื้อง\nเคืองขุ่น\nโคนม\nโคบาล\nโคมูตร\nโคมลอย\nโครงการ\nโครงเรื่อง\nโครงงาน\nโครงสร้าง\nโครมคราม\nโคลงเคลง\nฆ้องกระแต\nฆ้องชัย\nฆ้องวง\nฆ้องหุ่ย\nฆ้องเหม่ง\nฆ้องโหม่ง\nฆาตกร\nฆาตกรรม\nฆานประสาท\nงงงวย\nงงงัน\nงดเว้น\nงบดุล\nงบประมาณ\nงมโข่ง\nงมงาย\nง่วงงุน\nง่วงเหงา\nงอหาย\nง้องอน\nงอนง้อ\nงอมแงม\nงาช้าง\nง่าเงย\nงานการ\nง่ายดาย\nงึมงำ\nเงินเดือน\nเงินตรา\nเงินยวง\nเงียบกริบ\nเงียบเชียบ\nเงียบเหง
า\nเงื่องหงอย\nเงื่อนไข\nเงื่อนงำ\nเงื่อนเวลา\nเงื้อมมือ\nแง่งอน\nจงใจ\nจงรัก\nจดจ่อ\nจดจำ\nจดหมาย\nจดหมายเหตุ\nจรจัด\nจรรยาบรรณ\nจริงจัง\nจริงใจ\nจอมขวัญ\nจอมใจ\nจอมทัพ\nจอมปลวก\nจอมพล\nจ๊ะเอ๋\nจักสาน\nจักรพรรดิ\nจักรภพ\nจักรยาน\nจักรยานยนต์\nจักรราศี\nจักรวรรดิ\nจักรวรรดินิยม\nจักรวาล\nจังหนับ\nจัดการ\nจัดจ้าน\nจัดเจน\nจัดแจง\nจัดตั้ง\nจัดสรร\nจับกุม\nจับจด\nจับเจ่า\nจ่าหน้า\nจาตุทสี\nจาตุมหาราช\nจาตุมหาราชิก\nจาตุมหาราชิกา\nจาตุรงคสันนิบาต\nจาตุรราชการ\nจานเชิง\nจานบิน\nจานผี\nจานเสียง\nจาบจ้วง\nจำเป็น\nจำพรรษา\nจำวัด\nจ้ำจี้จ้ำไช\nจำเลาะตา\nจิงโจ้น้ำ\nจิตใจ\nจิตตภาวนา\nจิตตัง\nจิตตานุปัสสนา\nจิตนิยม\nจิตบำบัด\nจิตแพทย์\nจิตวิสัย\nจิตรกร\nจิตรกรรม\nจิตรลดา\nจิตวิทยา\nจิตเวช\nจิตเวชศาสตร์\nจินตกวี\nจินตนา\nจินตนาการ\nจินตภาพ\nจุฑามณี\nจุฑามาศ\nจุฑารัตน์\nจุนเจือ\nจุ้นจ้าน\nจุลชีพ\nจุลชีวัน\nจุลชีวิน\nจุลทรรศน์\nจุลภาค\nจุลวรรค\nจุลศักราช\nจุลสาร\nจุลินทรีย์\nจุฬามณี\nจุฬาลักษณ์\nเจตคติ\nเจตจำนง\nเจตนารมณ์\nเจตภูต\nเจริญพร\nเจ้ากรม\nเจ้ากรรม\nเจ้าของ\nเจ้าขา\nเจ้าข้า\nเจ้าคณะ\nเจ้าค่ะ\nเจ้าจอม\nเจ้าชู้\nเจ้าตัว\nเจ้าถิ่น\nเจ้าท่า\nเจ้าที่\nเจ้าทุกข์\nเจ้านาย\nเจ้าเนื้อ\nเจ้าบ้าน\nเจ้าบ่าว\nเจ้าประคุณ\nเจ้าประคู้น\nเจ้าพนักงาน\nเจ้าพระคุณ\nเจ้าพระยา\nเจ้าพ่อ\nเจ้าพายุ\nเจ้าฟ้า\nเจ้าภาพ\nเจ้ามือ\nเจ้าแม่\nเจ้าเรือน\nเจ้าสังกัด\nเจ้าสัว\nเจ้าสาว\nเจ้าหน้าที่\nเจ้าหนี้\nเจ้าอาวาส\nเจาะจง\nเจือจาง\nเจือจาน\nเจือปน\nเจื้อยแจ้ว\nแจกจ่าย\nแจ่มแจ้ง\nแจ่มใส\nโจงกระเบน\nโจมตี\nโจรกรรม\nโจรสลัด\nใจความ\nใจคอ\nฉกฉวย\nฉกชิง\nฉลองได\nฉ้อฉล\nฉัตรมงคล\nฉันทลักษณ์\nฉายาลักษณ์\nฉิบหาย\nฉุกเฉิน\nฉุกละหุก\nฉุนเฉียว\nฉุปศาสตร์\nเฉไฉ\nเฉยเมย\nเฉาโฉด\nเฉิดฉัน\nเฉิดฉาย\nเฉิดฉิน\nเฉียบขาด\nเฉียบพลัน\nเฉียบแหลม\nเฉื่อยชา\nแฉะแบะ\nโฉดเฉา\nโฉมงาม\nโฉมฉาย\nโฉมเฉลา\nโฉมตรู\nโฉมยง\nโฉมศรี\nโฉมหน้า\nชดช้อย\nชดเชย\nชดใช้\nชนบท\nชนินทร์\nชนกกรรม\nชนมพรรษา\nชนมายุ\nชมเชย\nชมพูทวีป\nชมพูนท\nชมพูนุท\nชราธรรม\nชราภาพ\nชลจร\nชลธาร\nชลธี\nชลนัยน์\nชลนา\nชลเนตร\nชลประทาน\nชลมารค\nชลาธาร\nชลาลัย\nชลาศัย\nชลาสินธุ์\nชโลทร\nช่วงชิง\nช่วงใช้\nชวนชม\nชวนหัว\nช่วยเหลือ\nช่อฟ้า\nช่อม่วง\nชอกช้ำ\nช่องเขา\nช่องแคบ\
nช่องไฟ\nช่องว่าง\nช้องนาง\nชอบกล\nชอบใจ\nชอบธรรม\nชอบพอ\nชักโครก\nชักเงา\nชักจูง\nชักชวน\nชักนำ\nชักเนื้อ\nชักพระ\nชักเย่อ\nชักใย\nชั่งใจ\nชังฆวิหาร\nชัดเจน\nชั้นเชิง\nชั่วคน\nชั่วคราว\nชั่วช้า\nชั่วโมง\nชั่วแล่น\nชาเย็น\nช้านาน\nช่างเครื่อง\nช่างฝีมือ\nช่างฟิต\nช่างไฟ\nช้างน้ำ\nช้างเผือก\nช้างพลาย\nช้างพัง\nช้างสาร\nช้างสีดอ\nชาติธรรม\nชาตินิยม\nชาติพันธุ์\nชาติพันธุ์วิทยา\nชาติภูมิ\nชานชาลา\nชายชาตรี\nชายคา\nชายฝั่ง\nชายทะเล\nชาวเล\nชาววัง\nช้ำใจ\nช้ำชอก\nชิงชัง\nชิงพลบ\nชินชา\nชินบุตร\nชิ้นเอก\nชิมลาง\nชีเปลือย\nชี้ขาด\nชี้แจง\nชี้นำ\nชี้แนะ\nชี้ฟ้า\nชีพจร\nชีพิตักษัย\nชื่นชม\nชื่นบาน\nชื่นมื่น\nชื่อย่อ\nชื่อรอง\nชื่อเล่น\nชื่อเสียง\nชุกชุม\nชุติมา\nชุบตัว\nชุบเลี้ยง\nชุมชน\nชุมทาง\nชุมสาย\nชุ่มใจ\nชุ่มชื่น\nชุ่มชื้น\nชุมนุมชน\nชูชีพ\nชูโรง\nชู้สาว\nเชยชม\nเชลยศักดิ์\nเชลยศึก\nเช่าซื้อ\nเช้าตรู่\nเช้ามืด\nเชิงกราน\nเชิงกล\nเชิงชั้น\nเชิงชาย\nเชิงซ้อน\nเชิงเดียว\nเชิงเดี่ยว\nเชิงตะกอน\nเชิงเทิน\nเชิงมุม\nเชิดชู\nเชิงอรรถ\nเชี่ยนหมาก\nเชี่ยวชาญ\nเชื่องช้า\nเชื่อใจ\nเชื่อถือ\nเชื่อฟัง\nเชื่อมือ\nเชื้อชาติ\nเชื้อเพลิง\nเชื้อไฟ\nเชื้อโรค\nเชื้อสาย\nเชื้อเชิญ\nเชื่องช้า\nเชือนแช\nเชื่อวัน\nแช่เย็น\nแช่อิ่ม\nแช่มช้อย\nแช่มชื่น\nโชกโชน\nโชติช่วง\nโชติรส\nใช้สอย\nซบเซา\nซมซาน\nซวนเซ\nซอกซอน\nซอกแซก\nซ่องสุม\nซ่องเสพ\nซ่องแซ่ง\nซ่อนรูป\nซ่อนเร้น\nซ่อนหา\nซ่อนกลิ่น\nซ่อนทราย\nซ่อมแซม\nซักค้าน\nซักซ้อม\nซักไซ้\nซักฟอก\nซักแห้ง\nซังกะตาย\nซังตาย\nซัดเซ\nซัดทอด\nซับซ้อน\nซับใน\nซับพระพักตร์\nซากศพ\nซ่านเซ็น\nซ้ำซ้อน\nซ้ำซาก\nซ้ำเติม\nซ้ำร้าย\nซี่โครง\nซีดเซียว\nซึมกะทือ\nซึมซาบ\nซึมเซา\nซึมทราบ\nซึมเศร้า\nซื่อตรง\nซื่อสัตย์\nซื้อขาย\nซุกซน\nซุกซ่อน\nซุบซิบ\nซู่ซ่า\nเซซัง\nเซ่อซ่า\nแซ่ซ้อง\nโซดาไฟ\nญาณทัสนะ\nญาณวิทยา\nญาณศาสตร์\nญาติกา\nฐานราก\nดกดื่น\nดงดิบ\nดลใจ\nดลบันดาล\nดวงแก้ว\nดวงใจ\nดวงเดือน\nดวงตรา\nดวงตา\nดวงสมร\nดอกจัน\nดอกจิก\nดอกบัว\nดอกเบี้ย\nดอกฟ้า\nดอกไม้\nดอกยาง\nดอกเล็บ\nดอกทอง\nดอกสร้อย\nดองยา\nดักคอ\nดักฟัง\nดังนั้น\nดังนี้\nดังหนึ่ง\nดั้งเดิม\nดัดจริต\nดัดแปลง\nดันทุรัง\nดับขันธ์\nดับจิต\nดับชีพ\nด่าทอ\nด่างทับทิม\nด่างพร้อย\nดาดฟ้า\nดาราศาสตร์\nดาลเดือด\n
ดาวกระจาย\nดาวเคราะห์\nดาวตก\nดาวเทียม\nดาวรุ่ง\nดาวเรือง\nดาวฤกษ์\nดาวหาง\nดาวเหนือ\nดาษดื่น\nดินขาว\nดินดาน\nดินดำ\nดินประสิว\nดินปืน\nดินระเบิด\nดินสอ\nดินสอพอง\nดิ้นรน\nดิบดี\nดีเกลือ\nดีใจ\nดีซ่าน\nดีดัก\nดีเดือด\nดีฝ่อ\nดีดดิ้น\nดึกดำบรรพ์\nดึกดื่น\nดึงดัน\nดึงดูด\nดื่มด่ำ\nดื้อด้าน\nดื้อดึง\nดื้อแพ่ง\nดื้อยา\nดื้อรั้น\nดุดัน\nดุเดือด\nดุร้าย\nดุลการค้า\nดุลพินิจ\nดุลภาค\nดุลยพินิจ\nดุลยภาพ\nดุษฎีนิพนธ์\nดุษฎีบัณฑิต\nดุษณีภาพ\nดูแคลน\nดูถูก\nดูดาย\nดูเบา\nดูแล\nดูหมิ่น\nดูเหมือน\nดูดดื่ม\nเด็ดขาด\nเด็ดดวง\nเด็ดเดี่ยว\nเดนตาย\nเดาสวด\nเดาสุ่ม\nเดินทาง\nเดินสะพัด\nเดินสาย\nเดินเหิน\nเดิมพัน\nเดียงสา\nเดียดฉันท์\nเดียวกัน\nเดียวดาย\nเดี๋ยวเดียว\nเดี๋ยวนี้\nเดือดดาล\nเดือดร้อน\nเดือนมืด\nเดือนหงาย\nแดดาล\nแดดิ้น\nแดกดัน\nโด่เด่\nโด่งดัง\nโดดเดี่ยว\nโดยสาร\nได้การ\nได้แก่\nได้ใจ\nได้ที\nได้ยิน\nได้เสีย\nตกเขียว\nตกค้าง\nตกใจ\nตกต่ำ\nตกแต่ง\nตกทอด\nตกฟาก\nตกมัน\nตกยาก\nตกลง\nตกหล่น\nต้นขั้ว\nต้นคิด\nต้นฉบับ\nต้นตอ\nต้นตำรับ\nต้นทุน\nต้นแบบ\nต้นเพลิง\nต้นมือ\nต้นไม้\nต้นร่าง\nต้นเรื่อง\nต้นสังกัด\nต้นหน\nต้นเหตุ\nตบตา\nตบแต่ง\nตบแผละ\nตบมือ\nต้มข่า\nต้มโคล้ง\nต้มยำ\nต้มส้ม\nตรมตรอม\nตรรกวิทยา\nตรรกศาสตร์\nตรวจการ\nตรวจการณ์\nตรวจตรา\nตระบัดสัตย์\nตรัสรู้\nตราตั้ง\nตราบาป\nตรายาง\nตราสาร\nตริตรอง\nตรีกฏุก\nตรีกาย\nตรีโกณ\nตรีโกณมิติ\nตรีคูณ\nตรีทูต\nตรีปิฎก\nตรีภพ\nตรีมูรติ\nตรึกตรอง\nตรึงตรา\nตรุษจีน\nตฤณชาติ\nตฤณมัย\nตลกบาตร\nตลบตะแลง\nตลบหลัง\nตลาดนัด\nตลาดน้ำ\nตลาดมืด\nตลาดสด\nต่อตี\nต่อเติม\nต่อว่า\nต่อสู้\nต่อกร\nต่อต้าน\nต่อแย้ง\nต้องการ\nต้องโทษ\nต้องหา\nต้อนรับ\nตอบโต้\nตอบแทน\nต่อยหอย\nตะพาบน้ำ\nตักตวง\nตักบาตร\nตั้งเข็ม\nตั้งไข่\nตั้งเค้า\nตั้งแง่\nตั้งใจ\nตั้งต้น\nตั้งแต่\nตั้งท้อง\nตัดขาด\nตัดใจ\nตัดเชือก\nตัดตอน\nตัดทอน\nตัดบท\nตัดพ้อ\nตัดรอน\nตัดสิน\nตับเต่า\nตับแลบ\nตับอ่อน\nตัวกลาง\nตัวการ\nตัวเก็ง\nตัวดี\nตัวตั้ง\nตัวเต็ง\nตัวถัง\nตัวแทน\nตัวประกอบ\nตัวประกัน\nตัวแปร\nตัวผู้\nตัวพิมพ์\nตัวเมีย\nตัวยืน\nตัวเลข\nตัวอย่าง\nตั๋วเงิน\nตั๋วแลกเงิน\nตากล้อง\nตาไก่\nตาข่าย\nตาชั่ง\nตาตุ่ม\nตาทวด\nตาปลา\nตาราง\nต่างหาก\nต้านทาน\nตามใจ\nตายใจ\nตายซาก\nตายด้าน\
nตายตัว\nตายทั้งกลม\nตายห่า\nตายโหง\nตาลปัตร\nต่ำช้า\nต่ำต้อย\nตำส้ม\nติเตียน\nติณชาติ\nติดขัด\nติดใจ\nติดต่อ\nติดตั้ง\nติดตาม\nติดตื้น\nติดพัน\nติดลม\nติดอ่าง\nตีเกลียว\nตีขลุม\nตีความ\nตีคู่\nตีจาก\nตีตื้น\nตีแผ่\nตีรวน\nตีลังกา\nตีวง\nตีเสมอ\nตีนกา\nตีนคู้\nตีนจก\nตีนตะขาบ\nตีนผี\nตีนเหยียด\nตึกแถว\nตึกระฟ้า\nตึงเครียด\nตึงตัง\nตื้นตัน\nตื่นตัว\nตื่นตูม\nตื่นเต้น\nตุ๊ต๊ะ\nตุ้บตั้บ\nตุ้มหู\nตุลาการ\nตุลาคม\nตู้นิรภัย\nตูมตาม\nตู้เสบียง\nเตโชธาตุ\nเตร็ดเตร่\nเต้นรำ\nเตาแก๊ส\nเตาผิง\nเตาฟู่\nเตาไฟ\nเตารีด\nเตาสูบ\nเต่าทอง\nเต้ารับ\nเต้าส่วน\nเต้าเสียบ\nเต้าหู้ยี้\nแต่ละ\nแตกคอ\nแตกคอก\nแตกฉาน\nแตกดับ\nแตกตื่น\nแตกพาน\nแตกแยก\nแตกร้าว\nแตกหัก\nแต่งงาน\nแต่งตั้ง\nแต้มคู\nแต้มต่อ\nแตรงอน\nแตรเดี่ยว\nแตรฝรั่ง\nแตรฟันฟาร์\nแตรวง\nโต้ตอบ\nโต้เถียง\nโต้แย้ง\nโต๊ะหมู่\nโต๊ะอิหม่าม\nใต้ถุน\nไต้ก๋ง\nไต่คู้\nไต่เต้า\nไต่ถาม\nไต้ฝุ่น\nไตรจักร\nไตรจีวร\nไตรตรึงษ์\nไตรทวาร\nไตรปิฎก\nไตรเพท\nไตรภพ\nไตรภูมิ\nไตรภาคี\nไตรยางศ์\nไตรรงค์\nไตรรัตน์\nไตรลักษณ์\nไตรโลก\nไตรสรณคมน์\nไตรสิกขา\nไต่สวน\nถกเถียง\nถดถอย\nถนัดถนี่\nถนิมสร้อย\nถมถืด\nถมเถ\nถมไป\nถลากไถล\nถ้วนถี่\nถ้วยฟู\nถ่องแท้\nถอดถอน\nถ้อยคำ\nถ้อยแถลง\nถากถาง\nถ่านไฟฉาย\nถ่านหิน\nถามไถ่\nถ่ายทอด\nถ่ายทุกข์\nถ่ายเท\nถาวรวัตถุ\nถ้ำมอง\nถี่ถ้วน\nถึงใจ\nถูกใจ\nถูกชะตา\nเถรวาท\nเถ้าแก่\nเถ้าแก่เนี้ย\nแถมพก\nแถลงการณ์\nไถ่ถอน\nไถ่ถาม\nทดแทน\nทดรอง\nทดลอง\nทดสอบ\nทนทาน\nทนายความ\nทบทวน\nทแยงมุม\nทรงกลด\nทรงเครื่อง\nทรงเจ้า\nทรัพย์สิน\nทรามชม\nทรามเชย\nทรามวัย\nทรามสงวน\nทรามสวาท\nทรุดโทรม\nทฤษฎีบท\nท้วงติง\nท่วมท้น\nทวาทศ\nทวาทศมาส\nทวาบรยุค\nทวารบาล\nทวิบถ\nทวิบท\nทวิบาท\nทวิภาค\nทวิภาคี\nทวีคูณ\nทศกัณฐ์\nทศชาติ\nทศทิศ\nทศนิยม\nทศพร\nทศพล\nทศพิธราชธรรม\nทศมาส\nทศวรรษ\nท่อไอเสีย\nท้อถอย\nท้อแท้\nทองขาว\nทองคำ\nทองคำขาว\nทองคำเปลว\nทองเค\nทองแดง\nทองบรอนซ์\nทองม้วน\nทองย้อย\nทองสัมฤทธิ์\nทองหยอด\nทองหยิบ\nทองเหลือง\nทองเอก\nท่องเที่ยว\nท้องตรา\nท้องถิ่น\nท้องที่\nท้องน้อย\nท้องร่อง\nท้องเรื่อง\nทอดมัน\nทอดทิ้ง\nทอดน่อง\nทอดยอด\nทอดหุ่ย\nทอยกอง\nทะเบียนบ้าน\nทะเลทราย\nทะเลสาบ\nทะเลหลวง\nทักขิณาวัฏ\nทักท้วง\nทักทาย\nทักษิณาวรรต\nทักษิณ
าทาน\nทักษิณานุประทาน\nทั้งกลม\nทั้งคน\nทั้งดุ้น\nทั้งที\nทั้งนั้น\nทั้งนี้\nทั้งปวง\nทั้งผอง\nทั้งเพ\nทั้งมวล\nทั้งสิ้น\nทั้งหมด\nทั้งหลาย\nทัณฑ์บน\nทัดทาน\nทัดเทียม\nทันควัน\nทันใจ\nทันใด\nทันตา\nทันสมัย\nทันที\nทับถม\nทับทรวง\nทับศัพท์\nทั่วถึง\nทั่วไป\nท่าทาง\nท่าที\nท้าทาย\nทางการ\nทางข้าม\nทางด่วน\nทางเท้า\nทางโท\nทางใน\nทางผ่าน\nทางม้าลาย\nทางหลวง\nทางออก\nทางเอก\nทานกัณฑ์\nทานตะวัน\nท่านชาย\nทานบารมี\nท่านผู้หญิง\nท่านหญิง\nทาบทาม\nท้ายทอย\nทารุณกรรม\nทำคลอด\nทำใจ\nทำซ้ำ\nทำท่า\nทำที\nทำแท้ง\nทำโทษ\nทำบาป\nทำบุญ\nทำพิษ\nทำฟัน\nทำร้าย\nทำวัตร\nทำสาว\nทำเสน่ห์\nทำหมัน\nทำให้\nทิ้งขว้าง\nทิ้งทวน\nทิ้งท้าย\nทินกร\nทิพจักขุ\nทิพโสต\nทิพยจักษุ\nทิพยญาณ\nทิพยเนตร\nทิพยรส\nทิพากร\nทิ่มตำ\nทิ่มแทง\nทิวากร\nทิวากาล\nทิศทาง\nทีเด็ด\nทีท่า\nทีนี้\nทีหลัง\nทีฆนิกาย\nทีฆสระ\nที่ดิน\nที่นอน\nที่นั่ง\nที่ปรึกษา\nที่พึ่ง\nที่มั่น\nที่ราบ\nที่ว่าการ\nที่สุด\nที่หมาย\nที่ไหน\nทุกที\nทุกเมื่อ\nทุกข์สุข\nทุนทรัพย์\nทุนนิยม\nทุนรอน\nทุนสำรอง\nทุ่มเถียง\nทุ่มเท\nทูนหัว\nทูลกระหม่อม\nเทกระจาด\nเทครัว\nเทพเจ้า\nเทพดา\nเทพธิดา\nเทพนม\nเทพนิยม\nเทพนิยาย\nเทพบุตร\nเทพสังหรณ์\nเทศกาล\nเทศนาโวหาร\nเทศบัญญัติ\nเทศบาล\nเทศมนตรี\nเทห์ฟากฟ้า\nเท่ากับ\nเท่าใด\nเท่าตัว\nเท่าทัน\nเท่าทุน\nเท่าเทียม\nเท่านั้น\nเท่าไร\nเท้าช้าง\nเทิดทูน\nเที่ยงตรง\nเที่ยงแท้\nเที่ยงธรรม\nเทียนชนวน\nเทียนพรรษา\nเทียบเคียง\nเทียบเท่า\nเทือกเขา\nเทือกเถา\nแท็งก์น้ำ\nแท่นพิมพ์\nแท่นมณฑล\nแท่นหมึก\nแทรกซอน\nแทรกซ้อน\nแทรกซึม\nแทรกแซง\nแทะโลม\nไทยดำ\nไทยทาน\nไทยธรรม\nไทยน้อย\nไทยใหญ่\nธงชัย\nธงชาติ\nธงทิว\nธรณีวิทยา\nธรณีสงฆ์\nธรรมกาย\nธรรมการ\nธรรมเกษตร\nธรรมขันธ์\nธรรมคุณ\nธรรมจรรยา\nธรรมจริยา\nธรรมจักร\nธรรมจักษุ\nธรรมจาคะ\nธรรมจารี\nธรรมชาติ\nธรรมดา\nธรรมเนียม\nธรรมราชา\nธรรมศาสตร์\nธรรมสภา\nธรรมสังเวช\nธัญพืช\nธารพระกร\nธีรภาพ\nธีรราช\nนกเขา\nนกต่อ\nนกยูง\nนกรู้\nนกหวีด\nนครบาล\nนครรัฐ\nนงคราญ\nนงนุช\nนงพะงา\nนงเยาว์\nนงราม\nนงลักษณ์\nนบนอบ\nนพเก้า\nนพคุณ\nนพเคราะห์\nนพปฎล\nนพพล\nนพรัตน์\nนพศก\nนพศูล\nนมข้น\nนมผง\nนมไม้\nนมนาน\nนมหนู\nนมแมว\nนรีเวช\nนรีเวชวิทยา\nนวดฟั้น\nนวยนาด\nนวลระหง\nนวลลออ\nนวลละออง\nนวลจันทร
์\nนอกครู\nนอกคอก\nนอกจาก\nนอกใจ\nนอกชาน\nนอกรีต\nนอกเหนือ\nนองเนือง\nนองเลือด\nนอนก้น\nนอนใจ\nนอบนบ\nนอนเล่น\nนอบน้อม\nน้อมนำ\nน้อยใจ\nน้อยหน้า\nนักการ\nนักการเมือง\nนักกีฬา\nนักข่าว\nนักท่องเที่ยว\nนักเทศน์\nนักโทษ\nนักธรรม\nนักบวช\nนักบิน\nนักบุญ\nนักปราชญ์\nนักพรต\nนักรบ\nนักเรียน\nนักเลง\nนักวิชาการ\nนักศึกษา\nนักสิทธิ์\nนักสืบ\nนักหนา\nนั่งเทียน\nนั่งร้าน\nนัดแนะ\nนัดหมาย\nนั่นแหละ\nนั่นเอง\nนับถือ\nนับประสา\nนัยน์ตา\nนาดำ\nนาปรัง\nนาปี\nนาสวน\nนาหว่าน\nนาคบาศ\nนาคปรก\nนาคราช\nนางกวัก\nนางกำนัล\nนางงาม\nนางใน\nนางบำเรอ\nนางแบบ\nนางพญา\nนางฟ้า\nนางไม้\nนางโลม\nนางสาว\nนางห้าม\nนางเอก\nนาฏกรรม\nนาฏดนตรี\nนาฏศิลป์\nนานนม\nน่านน้ำ\nน่านฟ้า\nนามกร\nนามธรรม\nนามไธย\nนามบัตร\nนามปากกา\nนามแฝง\nนามสกุล\nนามสงเคราะห์\nนามสมญา\nนายทะเบียน\nนายท่า\nนายท้าย\nนายทุน\nนายประกัน\nนายหน้า\nนายอำเภอ\nนารายณ์หัตถ์\nนารีผล\nนาวิกโยธิน\nนำจับ\nนำพา\nนำทาง\nนำร่อง\nนำสืบ\nนำแสดง\nน้ำกรด\nน้ำกาม\nน้ำเกลือ\nน้ำข้าว\nน้ำแข็ง\nน้ำแข็งไส\nน้ำแข็งแห้ง\nน้ำครำ\nน้ำคร่ำ\nน้ำค้าง\nน้ำค้างแข็ง\nน้ำคาวปลา\nน้ำคำ\nน้ำเค็ม\nน้ำเคย\nน้ำเงิน\nน้ำเงี้ยว\nน้ำจัณฑ์\nน้ำจิ้ม\nน้ำใจ\nน้ำเชื้อ\nน้ำเชื่อม\nน้ำซาวข้าว\nน้ำดอกไม้\nน้ำดี\nน้ำตก\nน้ำตา\nน้ำตาล\nน้ำท่า\nน้ำนม\nน้ำนวล\nน้ำบาดาล\nน้ำประสานทอง\nน้ำประปา\nน้ำปลา\nน้ำป่า\nน้ำผึ้ง\nน้ำพริก\nน้ำพริกเผา\nน้ำพี้\nน้ำพุ\nน้ำมนต์\nน้ำมนตร์\nน้ำมัน\nน้ำมือ\nน้ำมูก\nน้ำเมา\nน้ำย่อย\nน้ำยา\nน้ำรัก\nน้ำแร่\nน้ำลาย\nน้ำเลี้ยง\nน้ำสต๊อก\nน้ำส้ม\nน้ำส้มสายชู\nน้ำสังข์\nน้ำสาบาน\nน้ำเสียง\nน้ำหนวก\nน้ำหนอง\nน้ำหนัก\nน้ำหน้า\nน้ำหนึ่ง\nน้ำหมึก\nน้ำหอม\nน้ำเหลือง\nน้ำอบ\nน้ำอ้อย\nน้ำอัดลม\nนิ่งเฉย\nนิจศีล\nนิดเดียว\nนิดหน่อย\nนิติกร\nนิติกรรม\nนิติธรรม\nนิตินัย\nนิติบัญญัติ\nนิติบุคคล\nนิติภาวะ\nนิติวิทยาศาสตร์\nนิติเวช\nนิติเวชศาสตร์\nนิติศาสตร์\nนิเทศศาสตร์\nนิ่มนวล\nนิรุกติศาสตร์\nนิเวศวิทยา\nนิศากร\nนิศากาล\nนิศาชล\nนิศารัตน์\nนี่แน่ะ\nนี่แหละ\nนี่เอง\nนึกคิด\nนุงถุง\nนุ่งห่ม\nนุ่มนวล\nนุ่มนิ่ม\nเนตรนารี\nเนติบัณฑิต\nเนยเทียม\nเนยใส\nเนิ่นนาน\nเนิบนาบ\nเนื้อความ\nเนื้อคู่\nเนื้อเค็ม\nเนื้องอก\nเนื้อตัว\nเนื้อตาย\nเนื้อที่\nเนื้อแท้\nเนื้อเปื่อย\nเนื้อผ้า\nเ
นื้อเพลง\nเนื้อไม้\nเนื้อเยื่อ\nเนื้อร้อง\nเนื้อร้าย\nเนื้อเรื่อง\nเนื้อหา\nเนืองนอง\nเนืองนิตย์\nเนืองแน่น\nแน่ใจ\nแน่ชัด\nแน่แท้\nแน่นอน\nแน่นิ่ง\nแน่แน่ว\nแน่นแฟ้น\nแน่นหนา\nแนบเนียน\nแนบแน่น\nแนวคิด\nแนวทาง\nแนวโน้ม\nแนวป่า\nแนวรบ\nแนวร่วม\nแนวหน้า\nแนวหลัง\nแน่วแน่\nแนะนำ\nแนะแนว\nโน้มน้าว\nในหลวง\nบกพร่อง\nบงกช\nบงการ\nบดบัง\nบทกลอน\nบทกวี\nบทความ\nบทคัดย่อ\nบทเฉพาะกาล\nบทนำ\nบทบัญญัติ\nบทบาท\nบทประพันธ์\nบทเพลง\nบทร้อง\nบทเรียน\nบทลงโทษ\nบทสนทนา\nบทอัศจรรย์\nบทจร\nบทบงสุ์\nบทมาลย์\nบทรัช\nบทเรศ\nบทวลัญช์\nบนบาน\nบรมครู\nบรมธาตุ\nบรมบพิตร\nบรมวงศานุวงศ์\nบรมอัฐิ\nบรรณพิภพ\nบรรณศาลา\nบรรณาการ\nบรรณาธิการ\nบรรณานุกรม\nบรรณารักษ์\nบรรณารักษศาสตร์\nบรรดามี\nบรรดาศักดิ์\nบรรทัดฐาน\nบรรพบุรุษ\nบรรลัยกัลป์\nบรรลัยจักร\nบริคณห์สนธิ\nบวงสรวง\nบ่วงบาศ\nบ้วนพระโอษฐ์\nบ่อเกิด\nบอกกล่าว\nบอกบท\nบอกบุญ\nบอกใบ้\nบอกปัด\nบ้องกัญชา\nบ้องตื้น\nบ้องไฟ\nบ้องหู\nบอดสี\nบ่อนทำลาย\nบอบช้ำ\nบอบบาง\nบอบแบบ\nบังโกลน\nบังโคลน\nบังใบ\nบั้งไฟ\nบังคับการ\nบังคับบัญชา\nบัญชาการ\nบัณฑุกัมพล\nบัดดล\nบัดเดี๋ยว\nบัดนั้น\nบัดนี้\nบัดสีบัดเถลิง\nบัตรเครดิต\nบัตรพลี\nบัตรสนเท่ห์\nบัตรสินเชื่อ\nบั่นทอน\nบั้นท้าย\nบั้นปลาย\nบั้นพระองค์\nบั้นเอว\nบันไดลิง\nบันไดเลื่อน\nบันเทิงคดี\nบัวลอย\nบัวบก\nบ้าจี้\nบ้าดีเดือด\nบ้าน้ำลาย\nบ้าบิ่น\nบ้าระห่ำ\nบ้าเลือด\nบ้าหอบฟาง\nบากบั่น\nบากหน้า\nบางตา\nบางเบา\nบางที\nบาดเจ็บ\nบาดแผล\nบาดหมาง\nบาตรใหญ่\nบาทบงกช\nบาทบงสุ์\nบาทบริจาริกา\nบาทวิถี\nบานเกล็ด\nบานตะเกียง\nบานตะไท\nบานเบอะ\nบานปลาย\nบานแผละ\nบานพับ\nบ้านจัดสรร\nบ้านช่อง\nบ้านนอก\nบ้านพัก\nบ้านเมือง\nบ้านรับรอง\nบ้านเรือน\nบาปกรรม\nบายศรี\nบ่ายเบี่ยง\nบ่ายหน้า\nบ่าวไพร่\nบิดเบี้ยว\nบิดเบือน\nบิดพลิ้ว\nบี้แบน\nบีบคั้น\nบีบรัด\nบึ้งตึง\nบึ้งบูด\nบุกบั่น\nบุกเบิก\nบุกรุก\nบุคลิกภาพ\nบุคลิกลักษณะ\nบุญธรรม\nบุญนิธิ\nบุญฤทธิ์\nบุบสลาย\nบุ้ยใบ้\nบุรุษเพศ\nบุหงารำไป\nบู้บี้\nบูชายัญ\nบูดบึ้ง\nบูดเบี้ยว\nเบาความ\nเบาใจ\nเบาบาง\nเบาปัญญา\nเบามือ\nเบาแรง\nเบาสมอง\nเบาหวาน\nเบาโหวง\nเบ้าตา\nเบาะแส\nเบิกความ\nเบิกบาน\nเบี้ยล่าง\nเบี้ยเลี้ยง\nเบี้ยหวัด\nเบี่ยงบ่าย\nเบียดบัง\nเบียดเบียน\nเบียดเสียด\nเบื้องต้น\nเบื้องบน\
nเบื้องหน้า\nเบื้องหลัง\nแบกะดิน\nแบเบาะ\nแบ่งเบา\nแบ่งปัน\nแบ่งแยก\nแบบฉบับ\nแบบแปลน\nแบบแผน\nแบบฝึกหัด\nแบบพิมพ์\nแบบสอบถาม\nแบบอย่าง\nแบะแฉะ\nแบะท่า\nโบแดง\nโบราณคดี\nโบราณวัตถุ\nโบราณสถาน\nใบขับขี่\nใบจอง\nใบตอง\nใบแทรก\nใบบอก\nใบบุญ\nใบเบิกทาง\nใบปลิว\nใบพัด\nใบโพ\nใบไม้\nใบระกา\nใบรับรอง\nใบลา\nใบเลี้ยง\nใบสั่ง\nใบสำคัญ\nใบสุทธิ\nใบเสร็จ\nใบหน้า\nใบอนุญาต\nใบระกา\nปกครอง\nปกคลุม\nปกป้อง\nปกปิด\nปฏิบัติการ\nปฏิบัติบูชา\nปฐพีวิทยา\nปฐมฌาน\nปฐมทัศน์\nปฐมเทศนา\nปฐมนิเทศ\nปฐมพยาบาล\nปฐมยาม\nปฐมฤกษ์\nปฐมวัย\nปฐมสมโพธิ\nปนเป\nป่นปี้\nปมเขื่อง\nปมเด่น\nปมด้อย\nปรนเปรอ\nปรบไก่\nปรบมือ\nปรสิตวิทยา\nประโปรย\nประพรม\nประกันชีวิต\nประกันภัย\nประจักษ์พยาน\nประจัญบาน\nประจันหน้า\nประจำการ\nประจำเดือน\nประจำเมือง\nประจำยาม\nประชดประชัน\nประชากร\nประชากรศาสตร์\nประชาคม\nประชาชน\nประชาราษฎร์\nประชาชาติ\nประชาชี\nประชาทัณฑ์\nประชาบาล\nประชาพิจารณ์\nประชาภิบาล\nประชามติ\nประชาสงเคราะห์\nประชาสัมพันธ์\nประดับประดา\nประดามี\nประดาน้ำ\nประเดี๋ยวเดียว\nประเดี๋ยวนี้\nประทับใจ\nประทุษร้าย\nประเทศราช\nประพาสต้น\nประเพณีนิยม\nประลัยกัลป์\nประวัติการณ์\nประวัติศาสตร์\nประสบการณ์\nประสบการณ์นิยม\nประสาทการ\nประสูติการ\nประสูติกาล\nประเส\nปรับทุกข์\nปรับโทษ\nปรับปรุง\nปรากฏการณ์\nปราดเปรียว\nปราดเปรื่อง\nปราบปราม\nปริญญาบัตร\nปรัยัติธรรม\nปรุโปร่ง\nปลงใจ\nปลงตก\nปลดทุกข์\nปลดปลง\nปลดปล่อย\nปลดเปลื้อง\nปลดระวาง\nปลดแอก\nปล้นสะดม\nปลอกกระสุน\nปลอกคอ\nปลอดโปร่ง\nปลอดภัย\nปลอมปน\nปลอมแปลง\nปลอบโยน\nปล่อยใจ\nปล่อยตัว\nปล่อยปละ\nปลั๊กไฟ\nปลากริม\nปลาเค็ม\nปลาจ่อม\nปลาเจ่า\nปลาแดก\nปลาตู้\nปลาทอง\nปลาร้า\nปลาส้ม\nปลาดาว\nปลาบิน\nปลาฝา\nปลาวาฬ\nปลาหมึก\nปลาบปลื้ม\nปลายข้าว\nปลายแถว\nปลายทาง\nปลิ้นปลอก\nปลิ้นปล้อน\nปลีกตัว\nปลีกย่อย\nปลุกใจ\nปลุกปล้ำ\nปลุกปั่น\nปลุกระดม\nปลุกเสก\nปลูกฝัง\nปลูกสร้าง\nปวดถ่วง\nปวดมวน\nปวดร้าว\nป่วนปั่น\nป่วยการ\nปอกลอก\nป้องกัน\nปักใจ\nปักดำ\nปักหลัก\nปัจเจกบุคคล\nปัจเจกพุทธะ\nปัจเจกโพธิ\nปัจฉิมชน\nปัจฉิมทิศ\nปัจฉิมภาค\nปัจฉิมยาม\nปัจฉิมลิขิต\nปัจฉิมวัย\nปัจฉิมวาจา\nปัญญาชน\nปัญญาวิมุติ\nปัญญาอ่อน\nปัดเป่า\nปันส่วน\nปั่นป่วน\nปั่นแปะ\nปั่นหัว\nปั้นจิ้ม\nปั้นเจ๋อ\nปั้น
ปึ่ง\nปั้นสิบ\nปั๊มน้ำมัน\nป่าช้า\nป่าชายเลน\nป่าดง\nป่าดงดิบ\nป่าดิบ\nป่าเถื่อน\nป่าเบญจพรรณ\nป่าละเมาะ\nปากกา\nปากขอ\nปากแข็ง\nปากคอ\nปากคำ\nปากคีบ\nปากจัด\nปากน้ำ\nปากเปล่า\nปากเสียง\nปานกลาง\nป่านนี้\nป้านลม\nป้ายสี\nป่าวร้อง\nปิดฉาก\nปิดบัง\nปิตุฆาต\nปิตุภูมิ\nปีมะโว้\nปีแสง\nปี่กลาง\nปี่ไฉน\nปี่ชวา\nปี่นอก\nปี่ใน\nปี่พาทย์\nปี่อ้อ\nปีกกา\nปีนเกลียว\nปีนป่าย\nปึกแผ่น\nปึงปัง\nปืนกล\nปืนครก\nปืนพก\nปืนยา\nปืนยาว\nปืนลม\nปืนเล็ก\nปืนเล็กยาว\nปืนสั้น\nปืนใหญ่\nปุบปับ\nปุ๊บปั๊บ\nปุ่มเปือก\nปุยฝ้าย\nปุ๋ยคอก\nปุ๋ยเคมี\nปุ๋ยวิทยาศาสตร์\nปุ๋ยหมัก\nปุ๋ยอินทรีย์\nปูจ๋า\nปูเสฉวน\nปู่เจ้า\nปู่ทวด\nปูนขาว\nปูนซีเมนต์\nปูนดิบ\nปูนแดง\nปูนปลาสเตอร์\nปูนปั้น\nเป็ดเทศ\nเป็ดน้ำ\nเป็นกลาง\nเป็นใจ\nเป็นต้น\nเป็นต่อ\nเป็นรอง\nเป็นไร\nเป็นลม\nเป็นห่วง\nเป็นอยู่\nเปรมปรีดิ์\nเปรอะเปื้อน\nเปรียบเทียบ\nเปรียบเปรย\nเปรี้ยวปาก\nเปรี้ยวหวาน\nเปรื่องปราด\nเปลญวน\nเปล่งปลั่ง\nเปล่าดาย\nเปล่าเปลี่ยว\nเปลี่ยนใจ\nเปลี่ยนตัว\nเปลี่ยนแปลง\nเปลี่ยนมือ\nเปลี่ยนหน้า\nเป๋อเหลอ\nเปะปะ\nเป่ากบ\nเป้านิ่ง\nเป้าหมาย\nเปิดฉาก\nเปิดเปิง\nเปิดโปง\nเปิดผนึก\nเปิดเผย\nเปียกปูน\nแป้งสาลี\nแป้งนวล\nแป้งเปียก\nแป้งมัน\nแป้งฝุ่น\nแป้งร่ำ\nแป้งสิงคโปร์\nแป้งหมี่\nแปดปน\nแปดเปื้อน\nแปรปรวน\nแปรผัน\nแปรพักตร์\nแปรรูป\nแปรอักษร\nแปลกปลอม\nแปะโป้ง\nโป้ปด\nโปร่งแสง\nโปร่งใส\nโปรดปราน\nโปรยทาน\nโปรยปราย\nโปโลน้ำ\nผกผัน\nผกากรอง\nผงขาว\nผงชูรส\nผงซักฟอก\nผงฟู\nผดุงครรภ์\nผมไฟ\nผลพลอยได้\nผลลัพธ์\nผลัดเปลี่ยน\nผลิตผล\nผลิตภัณฑ์\nผลุบโผล่\nผสมเทียม\nผสมผสาน\nผสมผเส\nผสมพันธุ์\nผสมโรง\nผสมเสร็จ\nผ่องแผ้ว\nผ่องใส\nผ่อนคลาย\nผ่อนชำระ\nผ่อนปรน\nผ่อนผัน\nผ่อนส่ง\nผอมโซ\nผอมแห้ง\nผักชี\nผักตบชวา\nผักบุ้ง\nผังเมือง\nผัดผ่อน\nผันแปร\nผันผวน\nผ่าตัด\nผ่าเผย\nผ่าหมาก\nผ่าเหล่า\nผ้าขนหนู\nผ้าขาวม้า\nผ้าขี้ริ้ว\nผ้าเช็ดตัว\nผ้าเช็ดปาก\nผ้าเช็ดมือ\nผ้าเช็ดหน้า\nผ้าดิบ\nผ้าต่วน\nผ้าไตร\nผ้าถุง\nผ้าแถบ\nผ้านวม\nผ้านุ่ง\nผ้าใบ\nผ้าป่า\nผ้าป่าน\nผ้าผ่อน\nผ้าพันคอ\nผ้าพันแผล\nผ้าแพร\nผ้าโพกหัว\nผ้ามัดหมี่\nผ้ายาง\nผ้าลูกไม้\nผ้าเหลือง\nผ้าอนามัย\nผ้าอ้อม\nผาดโผน\nผาติกรรม\nผิดหวัง\nผิวเผิน\nผิวพรรณ\nผิวหนัง\nผีกระสือ\nผีกระหัง\nผีกองกอย\nผี
โขมด\nผีดิบ\nผีตองเหลือง\nผีถ้วยแก้ว\nผีแถน\nผีทะเล\nผีบุญ\nผีปอบ\nผีพุ่งไต้\nผีฟ้า\nผีเรือน\nผีสาง\nผีเสื้อ\nผีห่า\nผึ่งผาย\nผุดผ่อง\nผุดผาด\nผู้คน\nผู้คุม\nผู้จัดการ\nผู้ชาย\nผู้เชี่ยวชาญ\nผู้ดี\nผู้โดยสาร\nผู้ต้องขัง\nผู้ต้องหา\nผู้แทน\nผู้น้อย\nผู้บริโภค\nผู้บังคับบัญชา\nผู้ปกครอง\nผู้ประกอบการ\nผู้ป่วย\nผู้พิพากษา\nผู้เยาว์\nผู้ร้าย\nผู้วิเศษ\nผู้สื่อข่าว\nผู้เสียหาย\nผู้หญิง\nผู้ใหญ่\nผู้ใหญ่บ้าน\nผูกขวัญ\nผูกขาด\nผูกพัน\nผูกมัด\nเผชิญหน้า\nเผด็จการ\nเผด็จศึก\nเผยแผ่\nเผยแพร่\nเผละผละ\nเผ่าพันธุ์\nเผื่อแผ่\nแผงลอย\nแผนการ\nแผนงาน\nแผนที่\nแผนผัง\nแผนภาพ\nแผนภูมิ\nแผ่นดิน\nแผ่นเสียง\nแผ้วพาน\nโผงผาง\nฝนทอง\nฝอยทอง\nฝักแค\nฝักบัว\nฝักฝ่าย\nฝักใฝ่\nฝังใจ\nฝังหัว\nฝาชี\nฝาแฝด\nฝาละมี\nฝ่าพระบาท\nฝ่าฝืน\nฝ่าฟัน\nฝ้าฟาง\nฝากตัว\nฝากฝัง\nฝีดาษ\nฝีมะม่วง\nฝีจักร\nฝีเท้า\nฝีปาก\nฝีพาย\nฝีมือ\nฝีเย็บ\nฝึกงาน\nฝึกปรือ\nฝึกฝน\nฝึกสอน\nฝึกหัด\nฝืดเคือง\nใฝ่ฝัน\nพงพี\nพงศ์พันธุ์\nพญาโศก\nพญาไฟ\nพบปะ\nพบพาน\nพรสวรรค์\nพรมคด\nพรมแดน\nพรมมิ\nพรรคพวก\nพรรณราย\nพรวดพราด\nพรหมชาติ\nพรหมลิขิต\nพรหมโลก\nพรหมวิหาร\nพร้อมใจ\nพร้อมพรั่ง\nพร้อมเพรียง\nพร้อมมูล\nพร้อมสรรพ\nพร้อมหน้า\nพระครู\nพระคุณ\nพระเคราะห์\nพระเครื่อง\nพระเจ้า\nพระเจ้าอยู่หัว\nพระชายา\nพระทัย\nพระนาง\nพระนางเจ้า\nพระเป็นเจ้า\nพระผู้เป็นเจ้า\nพระพิมพ์\nพระพุทธเจ้า\nพระพุทธองค์\nพระภูมิ\nพระยา\nพระรอง\nพระสนม\nพระสนมเอก\nพระองค์\nพระองค์เจ้า\nพระเอก\nพรั่งพร้อม\nพรั่งพรู\nพรั่นพรึง\nพร่างพราว\nพรายน้ำ\nพรายแพรว\nพราวแพรว\nพร่ำพลอด\nพร่ำเพรื่อ\nพร่ำเพ้อ\nพริกไทย\nพริ้งพราย\nพริ้งเพรา\nพริ้งเพริศ\nพริบตา\nพริ้มพราย\nพริ้มเพรา\nพรุ่งนี้\nพฤติกรรม\nพฤติการณ์\nพฤตินัย\nพลการ\nพลขับ\nพลความ\nพลเมือง\nพลรบ\nพลร่ม\nพลเรือน\nพลโลก\nพลศึกษา\nพลบค่ำ\nพลอดรัก\nพลังงาน\nพลังเงียบ\nพลังจิต\nพลั้งปาก\nพลั้งเผลอ\nพลั้งพลาด\nพลัดถิ่น\nพลัดพราก\nพลาดท่า\nพลาดพลั้ง\nพลิกแพลง\nพลีกรรม\nพลุ่งพล่าน\nพวกพ้อง\nพวงมาลัย\nพวงมาลา\nพวงหรีด\nพวงคราม\nพวงชมพู\nพวงแสด\nพ่วงพี\nพวยน้ำ\nพวยพุ่ง\nพสกนิกร\nพหุคูณ\nพหุภาคี\nพหูพจน์\nพหูสูต\nพอควร\nพอใจ\nพอใช้\nพอใช้ได้\nพอดี\nพอตัว\nพอทำเนา\nพอประมาณ\nพอเพียง\nพอแรง\nพอสมควร\nพอเหมาะ\nพ่อขุน\nพ่อครัว\n
พ่อตา\nพ่อบ้าน\nพ่อพันธุ์\nพ่อม่าย\nพ่อเมือง\nพ่อเลี้ยง\nพ่อสื่อ\nพอกพูน\nพ้องพาน\nพักผ่อน\nพักพิง\nพักฟื้น\nพักร้อน\nพักแรม\nพัดยศ\nพัดลม\nพันพัว\nพับฐาน\nพับเพียบ\nพัวพัน\nพาซื่อ\nพาดพิง\nพิณพาทย์\nพิธีกร\nพิธีกรรม\nพิธีการ\nพิธีรีตอง\nพิธีสาร\nพินัยกรรม\nพิมพ์เขียว\nพิมพ์ใจ\nพิมพ์ดีด\nพิษสง\nพี่น้อง\nพี่เบิ้ม\nพี่เลี้ยง\nพึงใจ\nพึงพอใจ\nพึ่งพา\nพึ่งพิง\nพืชพันธุ์\nพืชมงคล\nพื้นฐาน\nพื้นที่\nพื้นบ้าน\nพื้นเพ\nพื้นเมือง\nพื้นเสีย\nพุพอง\nพุทธกาล\nพุทธคุณ\nพุทธจักร\nพุทธเจดีย์\nพุทธฎีกา\nพุทธปฏิมา\nพุทธปฏิมากร\nพุทธมามกะ\nพุทธศักราช\nพุทธศาสนิกชน\nพุทธองค์\nพุทธชาด\nพุทธรักษา\nพุ่มพวง\nพุ่มไม้\nพู่กัน\nพูดจา\nเพ่งเล็ง\nเพดานบิน\nเพดานปาก\nเพริศพราย\nเพริศพริ้ง\nเพริศแพร้ว\nเพรียกพร้อง\nเพรียวลม\nเพลงเชิด\nเพลงยาว\nเพลิงกัลป์\nเพลินใจ\nเพลินตา\nเพลี่ยงพล้ำ\nเพ้อฝัน\nเพาะกาย\nเพาะชำ\nเพาะปลูก\nเพิกถอน\nเพิกเฉย\nเพิ่มเติม\nเพิ่มพูน\nเพียงตา\nเพียงพอ\nเพียบแประ\nเพียบพร้อม\nเพื่อนเกลอ\nเพื่อนตาย\nเพื่อนบ้าน\nเพื่อนฝูง\nเพื่อนยาก\nแพ้ท้อง\nแพร่หลาย\nแพร่งพราย\nแพรวพราว\nโพธิญาณ\nโพธิบัลลังก์\nโพธิสมภาร\nโพธิสัตว์\nโพ้นทะเล\nโพยภัย\nไพ่ตาย\nไพ่ป๊อก\nไพรวัน\nไพรสณฑ์\nไพรสัณฑ์\nไพร่พล\nไพร่ฟ้า\nไพร่สม\nไพร่ส่วย\nไพร่หลวง\nฟกช้ำ\nฟองเต้าหู้\nฟองน้ำ\nฟองมัน\nฟ้องกลับ\nฟ้องร้อง\nฟอนเฟะ\nฟักทอง\nฟัดเฟียด\nฟันดาบ\nฟันฝ่า\nฟันแท้\nฟันน้ำนม\nฟันปลา\nฟันฟาง\nฟันเฟือง\nฟันม้า\nฟันเลื่อย\nฟันหนู\nฟั่นเฝือ\nฟั่นเฟือน\nฟื้นตัว\nฟื้นฝอย\nฟื้นฟู\nฟุ้งซ่าน\nฟุ้งเฟ้อ\nฟุ้งเฟื่อง\nฟุตบอล\nฟูฟ่อง\nฟูเฟื่อง\nฟูมฟัก\nฟูมฟาย\nเฟะฟะ\nเฟื่องฟ้า\nเฟื่องฟุ้ง\nเฟื่องฟู\nไฟฉาย\nไฟแช็ก\nไฟธาตุ\nไฟฟ้า\nภัตกิจ\nภาคทัณฑ์\nภาคพื้น\nภาคเรียน\nภาคภูมิ\nภาพถ่าย\nภาพนิ่ง\nภาพประกอบ\nภาพพจน์\nภาพยนตร์\nภาพลวงตา\nภาพลักษณ์\nภายนอก\nภายใน\nภายหน้า\nภายหลัง\nภารกิจ\nภารธุระ\nภารโรง\nภารตวิทยา\nภาษาศาสตร์\nภาสกร\nภิญโญภาพ\nภินชาติ\nภูธร\nภูธเรศ\nภูบาล\nภูเบศ\nภูเบศวร์\nภูเขา\nภูเขาไฟ\nภูผา\nภูตคาม\nภูตบดี\nภูตรูป\nภูเตศวร\nภูมินทร์\nภูมิบาล\nภูมิประเทศ\nภูมิภาค\nภูมิรัฐศาสตร์\nภูมิลำเนา\nภูมิศาสตร์\nภูมิอากาศ\nภูมิธรรม\nภูมิปัญญา\nภูมิรู้\nภูมิใจ\nภูมิฐาน\nภูมิคุ้มกัน\nภูมิแพ้\nภูษาโยง\nเภทภัย\nเภสัชกร\nเภสัชกรรม\
nเภสัชวิทยา\nเภสัชศาสตร์\nโภคทรัพย์\nโภคภัณฑ์\nโภชนากร\nโภชนาการ\nมกุฎราชกุมาร\nมงคลแฝด\nมงคลสูตร\nมงคลหัตถี\nมณเฑียรบาล\nมดดำ\nมดแดง\nมดเท็จ\nมดยอบ\nมดลูก\nมธุปายาส\nมธุรส\nมนเทียรบาล\nมนุษย์กบ\nมโนกรรม\nมโนคติ\nมโนทุจริต\nมโนธรรม\nมโนภาพ\nมโนมัย\nมโนรถ\nมโนรมย์\nมโนสุจริต\nมรรคนายก\nมรรคผล\nมฤคชาติ\nมฤคทายวัน\nมฤคราช\nมลทิน\nมลพิษ\nมลสาร\nมวกเหล็ก\nม้วนหน้า\nมวยไทย\nมวยปล้ำ\nมวยล้ม\nมวยวัด\nมวยสากล\nมวยหมู่\nมวลสาร\nมอคราม\nมอซอ\nมอหมึก\nมองเมียง\nมอบตัว\nมอบหมาย\nมอมเมา\nมะขามเทศ\nมะขามป้อม\nมะขามเปียก\nมะเขือเทศ\nมะเขือพวง\nมะพร้าวแก้ว\nมักคุ้น\nมักจี่\nมักง่าย\nมักน้อย\nมักมาก\nมักใหญ่\nมั่งคั่ง\nมั่งมี\nมัจจุราช\nมัชฌิมนิกาย\nมัชฌิมประเทศ\nมัชฌิมยาม\nมัชฌิมวัย\nมัดจำ\nมัดหมี่\nมัธยมกาล\nมัธยมศึกษา\nมันแกว\nมันเทศ\nมันฝรั่ง\nมันเปลว\nมันสมอง\nมั่นคง\nมั่นใจ\nมั่นหมาย\nมั่นเหมาะ\nมัวเมา\nมัวหมอง\nมั่วสุม\nม้าเทศ\nม้าน้ำ\nม้ามืด\nม้าเร็ว\nม้าล่อ\nม้าลาย\nมากมาย\nมาตรการ\nมาตรฐาน\nมาตราส่วน\nมาตุคาม\nมาตุฆาต\nมาตุภูมิ\nม่านตา\nม่านบังตา\nมายากร\nมายากล\nมายาการ\nมายาวี\nมารผจญ\nมารวิชัย\nมารสังคม\nมารหัวขน\nมาลาการ\nมิ่งขวัญ\nมิ่งมิตร\nมิจฉาจาร\nมิจฉาชีพ\nมิดชิด\nมิดเมี้ยน\nมิดหมี\nมิตรจิต\nมิตรภาพ\nมิตรสหาย\nมิน่า\nมีหน้า\nมีดโกน\nมีดดาบ\nมีดโต้\nมีดพก\nมีดพับ\nมีดสั้น\nมึนงง\nมึนชา\nมึนตึง\nมึนเมา\nมืดครึ้ม\nมืดมน\nมืดมัว\nมือจับ\nมือดี\nมือเติบ\nมือปืน\nมือเปล่า\nมือมืด\nมือสอง\nมือเสือ\nมือหนึ่ง\nมือใหม่\nมุกตลก\nมุขปาฐะ\nมุขมนตรี\nมุ่งมั่น\nมุ่งมาด\nมุ่งหน้า\nมุ่งหมาย\nมุ่งหวัง\nมุ้งลวด\nมุ้งสายบัว\nมุมก้ม\nมุมกลับ\nมุมเงย\nมุมฉาก\nมุมตรง\nมุมป้าน\nมุมมืด\nมุมแย้ง\nมุมสะท้อน\nมุมหักเห\nมุมแหลม\nมุสาวาท\nมูกเลือด\nมูกมัน\nมูกหลวง\nมูนดิน\nมูลฐาน\nมูลนาย\nมูลนิธิ\nมูลเหตุ\nมูลค่า\nมูลฝอย\nเม็ดเงิน\nเม็ดเลือด\nเม็ดโลหิต\nเม่นทะเล\nเมรุมาศ\nเมรุราช\nเมล์อากาศ\nเมาดิบ\nเมามัน\nเมามัว\nเมามาย\nเมินเฉย\nเมียน้อย\nเมียหลวง\nเมียงมอง\nเมี่ยงลาว\nเมี่ยงส้ม\nเมื่อกี้\nเมื่อตะกี้\nเมื่อใด\nเมื่อไร\nเมื่อไหร่\nเมื่อนั้น\nเมืองขึ้น\nเมืองท่า\nเมืองนอก\nเมืองหลวง\nเมื่อยขบ\nเมื่อยล้า\nแม่กอง\nแม่กุญแจ\nแม่คุณ\nแม่งาน\nแม่เจ้า\nแม่ชี\nแม่ทัพ\nแม่นม\nแม่น้ำ\nแม่บท\nแม่บ้าน\nแ
ม่เบี้ย\nแม่พระ\nแม่พิมพ์\nแม่เพลง\nแม่มด\nแม่ม่าย\nแม่ไม้\nแม่ยก\nแม่ยาย\nแม่ร้าง\nแม่เรือน\nแม่แรง\nแม่เล้า\nแม่เลี้ยง\nแม่สี\nแม่สื่อ\nแม่เหล็ก\nแมงมุม\nแม่นยำ\nแมลงช้าง\nแมลงวัน\nแมลงปอ\nแมลงภู่\nแมลงเม่า\nแมวเซา\nแมวน้ำ\nแมวป่า\nแมวมอง\nไม้กลัด\nไม้กวาด\nไม้กางเขน\nไม้เกาหลัง\nไม้ขีดไฟ\nไม้จิ้มฟัน\nไม้เด็ด\nไม้ตาย\nไม้ตีพริก\nไม้ที\nไม้เท้า\nไม้บรรทัด\nไม้เมตร\nไม้ระแนง\nไม้เรียว\nไม้หมอน\nไม้อัด\nไม้จัตวา\nไม้ตรี\nไม้ไต่คู้\nไม้โท\nไม้ผัด\nไม้มลาย\nไม้ม้วน\nไม้ยมก\nไม้หน้า\nไม้หันอากาศ\nไม้เอก\nยกกลีบ\nยกครู\nยกเครื่อง\nยกเค้า\nยกทรง\nยกฟ้อง\nยกเมฆ\nยกยอ\nยกย่อง\nยกเลิก\nยกเว้น\nย่นย่อ\nยมทูต\nยมบาล\nยมราช\nยมโลก\nยวดยิ่ง\nยวดยาน\nยวนยี\nยวบยาบ\nย่อท้อ\nย่อส่วน\nย่อหน้า\nย่อหย่อน\nยอกย้อน\nยองใย\nย่องเบา\nย่องแย่ง\nยอดเยี่ยม\nยอดอก\nย้อนยอก\nย้อนรอย\nย้อนศร\nย้อนแสง\nย้อนหลัง\nยอบแยบ\nยอมความ\nย่อมเยา\nย่อยยับ\nยักยอก\nยักย้าย\nยักเยื้อง\nยัญกรรม\nยัญพิธี\nยัดเยียด\nยับเยิน\nยับยั้ง\nยั่วยวน\nยั่วยุ\nยั่วเย้า\nยากวาด\nยากันยุง\nยาเขียว\nยาใจ\nยาฉุน\nยาชา\nยาซัด\nยาดอง\nยาแดง\nยาถ่าย\nยาธาตุ\nยานัตถุ์\nยาเบื่อ\nยาโป๊\nยาแฝด\nยาพิษ\nยาระบาย\nยาสลบ\nยาสั่ง\nยาสีฟัน\nยาสูบ\nยาเส้น\nยาเสพติด\nยาหม่อง\nยาเหลือง\nย่าทวด\nย่านาง\nยากแค้น\nยากจน\nยากเย็น\nยากไร้\nยางนอก\nยางใน\nยางมะตอย\nยางมะตูม\nยางลบ\nยางสน\nยางอาย\nย่างกราย\nย่างเยื้อง\nย่างสด\nย่างสามขุม\nย่างเหยียบ\nยานเกราะ\nยานพาหนะ\nยานอวกาศ\nยานคาง\nยายทวด\nยาวเฟื้อย\nยาวยืด\nยาวเหยียด\nยำทวาย\nยำใหญ่\nยำเกรง\nยำเยง\nย่ำต๊อก\nย่ำยี\nย่ำแย่\nยิงเป้า\nยิ่งนัก\nยิ่งยวด\nยิ่งใหญ่\nยินดี\nยินยอม\nยินร้าย\nยิ้มกริ่ม\nยิ้มแฉ่ง\nยิ้มแต้\nยิ้มแป้น\nยิ้มเผล่\nยิ้มเยาะ\nยิ้มแย้ม\nยียวน\nยึดครอง\nยึดถือ\nยึดมั่น\nยึดเหนี่ยว\nยืดยาด\nยืดยาว\nยืดเยื้อ\nยืดหยุ่น\nยืดอก\nยืนกราน\nยืนต้น\nยืนพื้น\nยืนยง\nยืนยัน\nยืนหยัด\nยื้อยุด\nยุยง\nยุแยง\nยุแหย่\nยุคลบาท\nยุคเข็ญ\nยุคทอง\nยุคมืด\nยุ่งขิง\nยุ่งยาก\nยุ่งเหยิง\nยุติธรรม\nยุทธการ\nยุทธนาวี\nยุทธปัจจัย\nยุทธภัณฑ์\nยุทธภูมิ\nยุทธวิธี\nยุทธศาสตร์\nยุทธหัตถี\nยุทธนาการ\nยุทธนาธิการ\nยุบยับ\nยุบยิบ\nยุพราช\nยู่ยี่\nเย็นเจี๊ยบ\nเย็นใจ\nเย็นฉ่ำ\nเย็นเฉียบ\nเย็นชา\nเย็นชืด\nเย็นตา\nเ
ย็นเยียบ\nเย็นเยือก\nเย็นวาบ\nเย็นวูบ\nเย็บกี่\nเย็บจักร\nเย็บด้าย\nเย้ยหยัน\nเย้าหยอก\nเยาะเย้ย\nเยี่ยมกราย\nเยี่ยมเยียน\nเยี่ยมเยือน\nเยี่ยมยอด\nเยื่อเคย\nเยื่อใย\nเยือกเย็น\nเยื้องกราย\nเยื้องยัก\nเยื้องย่าง\nแยกย้าย\nแยกแยะ\nแย่งชิง\nแยบคาย\nแยบยล\nแย้มพราย\nแย้มยิ้ม\nแย้มสรวล\nโยเย\nโย้เย้\nโยกโคลง\nโยกย้าย\nโยกโย้\nโยนกลอง\nใยหิน\nรกชัฏ\nรกร้าง\nรกเรี้ยว\nรกเรื้อ\nรกราก\nรงควัตถุ\nรชนีกร\nรถกระบะ\nรถเก๋ง\nรถเข็น\nรถแข่ง\nรถจักร\nรถจี๊ป\nรถตู้\nรถทัวร์\nรถบรรทุก\nรถพ่วง\nรถพยาบาล\nรถไฟ\nรถไฟฟ้า\nรถม้า\nรถเมล์\nรถยนต์\nรถราง\nรถลาก\nรถสปอร์ต\nรถสิบล้อ\nรบกวน\nรบรา\nรบเร้า\nรมดำ\nร่มเกล้า\nร่มชูชีพ\nร่มเย็น\nร่มรื่น\nร่วงโรย\nรวงผึ้ง\nรวงรัง\nรวดเร็ว\nรวนเร\nรวบยอด\nรวบรวม\nรวบรัด\nรวมพล\nรวมหัว\nร่วมใจ\nร่วมเพศ\nร่วมมือ\nร่วมรัก\nร่วมสมัย\nรวยริน\nรวยรื่น\nรสชาติ\nรสนิยม\nรองท้อง\nรองเท้า\nรองพื้น\nร่องน้ำ\nร่องรอย\nร้องขอ\nร้องทุกข์\nร้องเรียน\nร้องห่ม\nร้องไห้\nรองช้ำ\nรองทรง\nรอดชีวิต\nรอดตัว\nรอดตาย\nรอนแรม\nร่อนเร่\nร้อนใจ\nร้อนตัว\nร้อนรน\nร้อนรุ่ม\nร้อนวิชา\nร้อนอาสน์\nรอบจัด\nรอบเดือน\nรอบรู้\nรอยร้าว\nร่อยหรอ\nร้อยละ\nร้อยกรอง\nร้อยแก้ว\nร้อยหวาย\nระนาดทุ้ม\nระนาดเอก\nระเบิดขวด\nระเบิดมือ\nระเบียบการ\nรักใคร่\nรักษาการ\nรักษาการณ์\nรังไข่\nรังแตน\nรังนก\nรังผึ้ง\nรังเพลิง\nรังมด\nรังสรรค์\nรังสฤษฏ์\nรั้งรอ\nรังสีแพทย์\nรังสีวิทยา\nรัชกาล\nรัชทายาท\nรัชนีกร\nรัฐธรรมนูญ\nรัฐบาล\nรัฐบุรุษ\nรัฐประศาสน์\nรัฐประหาร\nรัฐพิธี\nรัฐมนตรี\nรัฐวิสาหกิจ\nรัฐศาสตร์\nรัฐสภา\nรัดกุม\nรัดเกล้า\nรัดตัว\nรัดประคด\nรัดรึง\nรัดรูป\nรัตติกาล\nรับขวัญ\nรับจ้าง\nรับช่วง\nรับใช้\nรับซื้อ\nรับทราบ\nรับประกัน\nรับประทาน\nรับปาก\nรับผิด\nรับผิดชอบ\nรับฟ้อง\nรับฟัง\nรับมือ\nรับรอง\nรับรู้\nรับสมัคร\nรับสั่ง\nรับหน้า\nรับเหมา\nรั่วไหล\nรามือ\nร่าเริง\nรากแก้ว\nรากขวัญ\nรากฐาน\nรากฟัน\nรากศัพท์\nรากเหง้า\nร่างกาย\nร่างแห\nร้างรา\nราชกรณียกิจ\nราชการ\nราชกิจ\nราชครู\nราชฐาน\nราชทัณฑ์\nราชทินนาม\nราชทูต\nราชธานี\nราชนาวี\nราชบัณฑิต\nราชบัลลังก์\nราชบาตร\nราชบุตร\nราชปะแตน\nราชภัฏ\nราชมัล\nราชยาน\nราชรถ\nราชลัญจกร\nราชเลขาธิการ\nราชเลขานุการ\nราชวงศ์\nราชวัติ\nราชสกุล\nราชสมบัติ\nราชสาส
์น\nราชหัตถเลขา\nราชองครักษ์\nราชโองการ\nราชาคณะ\nราชาศัพท์\nราชินีนาถ\nร้านชำ\nร้านรวง\nราบคาบ\nราบรื่น\nราบเรียบ\nรายการ\nรายงาน\nรายจ่าย\nรายได้\nรายทาง\nรายรับ\nรายล้อม\nรายละเอียด\nรายวิชา\nร่ายยาว\nร่ายรำ\nร้ายกาจ\nร้ายแรง\nราวนม\nราวป่า\nร้าวฉาน\nร้าวราน\nรำพัด\nรำแพน\nรำวง\nร่ำไป\nร่ำร้อง\nร่ำเรียน\nร่ำไร\nร่ำลา\nร่ำไห้\nริเริ่ม\nริอ่าน\nริมฝีปาก\nริ้วรอย\nรีบร้อน\nรีบรุด\nรีบเร่ง\nรื่นรมย์\nรื่นเริง\nรื้อถอน\nรื้อฟื้น\nรุกฆาต\nรุกราน\nรุกล้ำ\nรุกไล่\nรุ่งขึ้น\nรุ่งแจ้ง\nรุ่งเช้า\nรุ่งเรือง\nรุ่งโรจน์\nรุ่งสว่าง\nรุ่งสาง\nรุ่งอรุณ\nรุจิเรข\nรุดหน้า\nรุนแรง\nรุมเร้า\nรุมล้อม\nรุ่มรวย\nรุ่มร้อน\nรุ่ยร่าย\nรู้แกว\nรู้ความ\nรู้คุณ\nรู้งาน\nรู้จัก\nรู้แจ้ง\nรู้ใจ\nรู้เชิง\nรู้ตัว\nรู้ทัน\nรู้เท่า\nรู้เรื่อง\nรู้สำนึก\nรู้สึก\nรู้เห็น\nรูปการณ์\nรูปโฉม\nรูปฌาน\nรูปถ่าย\nรูปทรง\nรูปธรรม\nรูปแบบ\nรูปพรรณ\nรูปพรหม\nรูปภพ\nรูปภาพ\nรูปร่าง\nรูปสมบัติ\nเร่ร่อน\nเร่งด่วน\nเร่งมือ\nเร่งรัด\nเร่งรีบ\nเร่งเร้า\nเร้นลับ\nเร่อร่า\nเร่าร้อน\nเราะราย\nเราะร้าย\nเริงใจ\nเริงรมย์\nเริดร้าง\nเริ่มต้น\nเริ่มแรก\nเรี่ยราด\nเรี่ยไร\nเรียกคืน\nเรียกตัว\nเรียกร้อง\nเรียกหา\nเรียบร้อย\nเรียงความ\nเรียงตัว\nเรียงเบอร์\nเรียงพิมพ์\nเรียงเม็ด\nเรียงราย\nเรียนรู้\nเรียบร้อย\nเรียบเรียง\nเรียบวุธ\nเรี่ยมเร้\nเรี่ยวแรง\nเรือกลไฟ\nเรือกอและ\nเรือกำปั่น\nเรือจ้าง\nเรือดำน้ำ\nเรือโดยสาร\nเรือตรวจการณ์\nเรือตังเก\nเรือธง\nเรือนำร่อง\nเรือบด\nเรือบิน\nเรือใบ\nเรือประมง\nเรือพ่วง\nเรือพิฆาต\nเรือยนต์\nเรือยาว\nเรือโยง\nเรือรบ\nเรือลากจูง\nเรือสำปั้น\nเรือสำเภา\nเรือหลวง\nเรือหางยาว\nเรืออีโปง\nเรือเอี้ยมจุ๊น\nเรื้อรัง\nเรือกสวน\nเรืองนาม\nเรืองรอง\nเรืองแสง\nเรื่องราว\nเรื่องสั้น\nเรือนแก้ว\nเรือนจำ\nเรือนเบี้ย\nเรือนแพ\nเรือนหอ\nเรื่อยเจื้อย\nเรื่อยเฉื่อย\nเรื่อยเปื่อย\nแรเงา\nแรกนา\nแรกนาขวัญ\nแรงงาน\nแรงดึงดูด\nแรงเทียน\nแรงม้า\nแรงเหวี่ยง\nแรมรอน\nแรมรา\nแรมโรย\nโรคจิต\nโรงครัว\nโรงงาน\nโรงเจ\nโรงเตี๊ยม\nโรงทาน\nโรงนา\nโรงพยาบาล\nโรงพัก\nโรงพิมพ์\nโรงเรียน\nโรงเรือน\nโรงแรม\nโรงเลี้ยง\nโรงเลื่อย\nโรงสี\nโรงสีข้าว\nโรงอาหาร\nโรมรัน\nโรยรา\nฤชากร\nฤดูกาล\nลงขัน\nลงแขก\nลงคอ\nลงตัว\nลงท้าย\nลงทุน\
nลงโทษ\nลงพุง\nลงมือ\nลงรัก\nลงรอย\nลงแรง\nลงโรง\nลงเอย\nลดตัว\nลดละ\nลดเลี้ยว\nลดหย่อน\nลดหลั่น\nลนลาน\nล้นพ้น\nล้นหลาม\nล้นเหลือ\nลบล้าง\nลบเลือน\nลบหลู่\nลมกรด\nลมค้า\nลมงวง\nลมแดด\nลมทะเล\nลมบก\nลมบน\nลมบ้าหมู\nลมปราณ\nลมปาก\nลมพิษ\nลมว่าว\nลมเสีย\nลมหนาว\nลมหายใจ\nล่มจม\nล่มสลาย\nล้มละลาย\nล้มลุก\nล้มเลิก\nล่วงเกิน\nล่วงรู้\nล่วงละเมิด\nล่วงลับ\nล่วงล้ำ\nล่วงเลย\nล่วงหน้า\nลวดลาย\nลวดสปริง\nลวดหนาม\nล้วนแล้ว\nลหุโทษ\nล่อลวง\nล่อหลอก\nล่อแหลม\nล้อต๊อก\nล้อเลื่อน\nล้อเล่น\nล้อเลียน\nล้อหลอก\nลองเชิง\nลองดี\nลองภูมิ\nล่องหน\nลอดช่อง\nล่อนจ้อน\nลอบกัด\nล้อมวง\nลอยแก้ว\nลอยชาย\nลอยตัว\nลอยนวล\nลอยแพ\nลอยลำ\nละทิ้ง\nละเลย\nละเว้น\nละครนอก\nละครใน\nละครเพลง\nละครร้อง\nละครรำ\nละครลิง\nละครสัตว์\nละเอียดอ่อน\nลักไก่\nลักพา\nลักเพศ\nลักยิ้ม\nลักลอบ\nลักลั่น\nลักหลับ\nลัดเลาะ\nลับตา\nลับแล\nลับหลัง\nลาออก\nล่าช้า\nล่าทัพ\nล้าสมัย\nล้าหลัง\nลากข้าง\nล้างบาง\nล้างผลาญ\nลาดเขา\nลาดตระเวน\nลาดเท\nลาดยาง\nลานบิน\nลาภปาก\nลาภลอย\nลามปาม\nลามเลีย\nลายคราม\nลายเซ็น\nลายแทง\nลายน้ำ\nลายพร้อย\nลายมือ\nลายลักษณ์\nลายเส้น\nลำกล้อง\nลำแข้ง\nลำธาร\nลำแสง\nลำไส้\nลำตัด\nลำนำ\nล่ำสัน\nล้ำยุค\nล้ำสมัย\nล้ำลึก\nล้ำเลิศ\nล้ำเส้น\nล้ำหน้า\nลิงจุ่น\nลิงลม\nลิงโลด\nลิดรอน\nลิ้นไก่\nลิ้นชัก\nลิ้นปี่\nลิ้นควาย\nลิ้นงูเห่า\nลิ้นหมา\nลิบลับ\nลิบลิ่ว\nลิ่มเลือด\nลี้ภัย\nลี้ลับ\nลึกซึ้ง\nลึกลับ\nลึกล้ำ\nลืมตน\nลืมต้น\nลืมตัว\nลืมตา\nลืมเลือน\nลือชา\nลือชื่อ\nลือลั่น\nลุล่วง\nลุกลน\nลุกลาม\nลุกลี้ลุกลน\nลุกฮือ\nลุ่มน้ำ\nลุ่มลึก\nลุ่มหลง\nลุ่ทาง\nลูกกรง\nลูกกรอก\nลูกกรุง\nลูกกลอน\nลูกกลิ้ง\nลูกกวาด\nลูกกะจ๊อก\nลูกกุญแจ\nลูกเกด\nลูกแก้ว\nลูกขนไก่\nลูกข่าง\nลูกขุน\nลูกเขย\nลูกครึ่ง\nลูกคลื่น\nลูกความ\nลูกคอ\nลูกค้า\nลูกคิด\nลูกคู่\nลูกจ้าง\nลูกช้าง\nลูกชิด\nลูกชิ้น\nลูกชุบ\nลูกซอง\nลูกโซ่\nลูกดอก\nลูกดิ่ง\nลูกตะกั่ว\nลูกตุ้ม\nลูกเต้า\nลูกเต๋า\nลูกถ้วย\nลูกทุ่ง\nลูกเธอ\nลูกน้อง\nลูกน้ำ\nลูกนิมิต\nลูกบอล\nลูกบ้าน\nลูกบาศก์\nลูกบิด\nลูกเบี้ยว\nลูกประคบ\nลูกประคำ\nลูกปัด\nลูกปืน\nลูกโป่ง\nลูกผสม\nลูกผู้ชาย\nลูกผู้หญิง\nลูกพรรค\nลูกพี่\nลูกฟูก\nลูกไฟ\nลูกมือ\nลูกโม่\nลูกไม้\nลูกยาเธอ\nลูกรอก\nลูกรัง\nลูกเรื
อ\nลูกล้อ\nลูกลอย\nลูกเล่น\nลูกเลี้ยง\nลูกโลก\nลูกวัด\nลูกศร\nลูกศิษย์\nลูกสมุน\nลูกสะใภ้\nลูกสูบ\nลูกเสือ\nลูกหนัง\nลูกหนี้\nลูกหนู\nลูกหมาก\nลูกหลง\nลูกหลาน\nลูกหาบ\nลูกหิน\nลูกเห็บ\nลูกเหม็น\nลูกแหง่\nลูกอม\nลูกหม้อ\nลูบคม\nลูบคลำ\nลูบไล้\nเล็กน้อย\nเลขคณิต\nเลขผา\nเลขหมาย\nเล็ดลอด\nเล่นงาน\nเล่นแง่\nเล่นชู้\nเล่นตัว\nเล่นลิ้น\nเล่นหัว\nเลนส์นูน\nเลนส์เว้า\nเล็บครุฑ\nเลยเถิด\nเลศนัย\nเล่ห์กล\nเล่ห์เหลี่ยม\nเลอโฉม\nเลอมาน\nเลอเลิศ\nเลอสรวง\nเล่อล่า\nเลอะเลือน\nเล่าเรียน\nเล่าลือ\nเลาะลัด\nเลิกรา\nเลิกร้าง\nเลิกล้ม\nเลิศเลอ\nเลี้ยงชีพ\nเลี้ยงดู\nเลี้ยงต้อย\nเลียบเคียง\nเลี้ยวลด\nเลือกตั้ง\nเลือกเฟ้น\nเลือกสรร\nเลื่องลือ\nเลือดกำเดา\nเลือดเนื้อ\nเลือดฝาด\nเลือดเย็น\nเลือดร้อน\nเลือดหมู\nเลือดอุ่น\nเลือนราง\nเลื่อนเปื้อน\nเลื่อนลอย\nเลื่อมพราย\nเลื่อมใส\nเลื่อยฉลุ\nเลื่อยลันดา\nเลื่อยวงเดือน\nเลื้อยคลาน\nแลเหลียว\nแลกเปลี่ยน\nแล้วกัน\nและเล็ม\nโล่งใจ\nโล่งโถง\nโล่งอก\nโลดเต้น\nโลดโผน\nโลดลิ่ว\nโลดแล่น\nไล่ที่\nไล่เบี้ย\nไล่เลี่ย\nไล่เลียง\nไล่หลัง\nไล่ออก\nวกวน\nวงกบ\nวงกลม\nวงการ\nวงแขน\nวงเงิน\nวงจร\nวงนอก\nวงใน\nวงรี\nวงเล็บ\nวงเวียน\nวงแหวน\nวงศ์วาน\nวจีกรรม\nวจีเภท\nวจีภาค\nวนเวียน\nวอดวาย\nว็อบแว็บ\nวังวน\nวังหน้า\nวังหลวง\nวังหลัง\nวัดราษฎร์\nวัดวา\nวัดหลวง\nวัดผล\nวัดพื้น\nวัตถุนิยม\nวัตถุประสงค์\nวัตรปฏิบัติ\nวันโกน\nวันพระ\nวันเพ็ญ\nวัยรุ่น\nวัยวุฒิ\nว่ากล่าว\nว่าจ้าง\nว่าด้วย\nว่าที่\nวางก้าม\nวางใจ\nวางตัว\nวางตา\nวางโต\nวางท่า\nวางมวย\nวางมาด\nวางมือ\nวางวาย\nว่างเปล่า\nว่างเว้น\nวาดเขียน\nว่านเครือ\nวาบหวาม\nวายชนม์\nวายปราณ\nวายวาง\nวายวอด\nวายร้าย\nวายุภักษ์\nวาววับ\nวาววาม\nวาวแวว\nวาวแสง\nวิกฤตการณ์\nวิกฤติการณ์\nวิกฤตกาล\nวิกฤติกาล\nวิกลจริต\nวิงเวียน\nวิ่งเต้น\nวิ่งผลัด\nวิ่งรอก\nวิ่งราว\nวิจิตรศิลป์\nวิชาการ\nวิชาชีพ\nวิชาธร\nวิญญูชน\nวิดพื้น\nวิตกจริต\nวิถีทาง\nวิทยากร\nวิทยากล\nวิทยาการ\nวิทยาเขต\nวิทยาทาน\nวิทยาธร\nวิทยานิพนธ์\nวิทยาศาสตร์\nวิเทศสัมพันธ์\nวิธีการ\nวินัยธร\nวินัยปิฎก\nวินาศกรรม\nวินาศภัย\nวินาศสันตะโร\nวิภัชพยากรณ์\nวิภัชวาที\nวิไลวรรณ\nวิสัญญีแพทย์\nวิสัญญีภาพ\nวิสัญญีวิทยา\nวุฒิบัตร\nวุฒิสภา\nวุฒิสมาชิก\nวุ่นวาย\nวุ้นเส้น\nวู
บวาบ\nเวจกุฎี\nเวจมรรค\nเวชกรรม\nเวชภัณฑ์\nเวชศาสตร์\nเวทมนตร์\nเวนคืน\nเวรกรรม\nเวฬุการ\nเวฬุวัน\nเว้าวอน\nเวิ้งว้าง\nเวียงวัง\nเวียนเทียน\nแว้งกัด\nแวดล้อม\nแวดวง\nแว่นขยาย\nแว่นแคว้น\nแว่นตา\nแวบวับ\nแววตา\nแวววาม\nแวววาว\nแวะเวียน\nโวยวาย\nไวไฟ\nไว้ใจ\nไว้ชื่อ\nไว้ตัว\nไว้ทุกข์\nไว้ลาย\nไว้หน้า\nไว้อาลัย\nศนิวาร\nศอกกลับ\nศอกกำ\nศอกกำมา\nศักดิ์ศรี\nศักดิ์สิทธิ์\nศารทวิษุวัติ\nศาลแขวง\nศาลจังหวัด\nศาลชั้นต้น\nศาลฎีกา\nศาลเตี้ย\nศาลทหาร\nศาลปกครอง\nศาลพระภูมิ\nศาลเพียงตา\nศาลแพ่ง\nศาลรัฐธรรมนูญ\nศาลแรงงาน\nศาลล้มละลาย\nศาลโลก\nศาลสูง\nศาลสูงสุด\nศาลอาญา\nศาลอุทธรณ์\nศาลากลาง\nศาลาดิน\nศาลาราย\nศาลาวัด\nศิลาฤกษ์\nศิลาแลง\nศิษย์เก่า\nศิษย์เอก\nศีลจุ่ม\nศีลธรรม\nศีลวัต\nศีลอด\nศูนย์กลาง\nศูนย์การค้า\nศูนย์ถ่วง\nศูนย์สูตร\nศูนย์หน้า\nเศร้าใจ\nเศร้าโศก\nเศร้าสร้อย\nเศร้าสลด\nเศร้าหมอง\nเศวตฉัตร\nเศษเกิน\nเศษซ้อน\nเศษวรรค\nเศษส่วน\nเศษเหล็ก\nโศกนาฏกรรม\nโศกศัลย์\nโศกเศร้า\nโศกสลด\nสกลโลก\nส่งเดช\nส่งท้าย\nส่งเสริม\nส่งเสีย\nส่งเสียง\nสงบเงียบ\nสงบเสงี่ยม\nสง่างาม\nสง่าราศี\nสดชื่น\nสดใส\nสตรีเพศ\nสติปัญญา\nสถลมารค\nสถานกงสุล\nสถานที่\nสถานทูต\nสถานการณ์\nสถานภาพ\nสถิติศาสตร์\nสนตะพาย\nสนใจ\nส้นตีน\nสนธิสัญญา\nสนนราคา\nสนับแข้ง\nสนับเพลา\nสนับมือ\nสนามบิน\nสนามเพลาะ\nสนิทสนม\nสนิมขุม\nสนิมสร้อย\nสนุกสนาน\nสบประมาท\nสบายใจ\nสภาพธรรม\nสมควร\nสมจริง\nสมใจ\nสมนัย\nสมน้ำหน้า\nสมประกอบ\nสมส่วน\nสมหวัง\nสมคบ\nสมทบ\nสมยอม\nสมรัก\nสมรู้\nสมสู่\nส้มฉุน\nส้มตำ\nส้มลิ้ม\nส้มกุ้ง\nส้มเช้า\nสมญานาม\nสมมติฐาน\nสมมุติฐาน\nสมมติเทพ\nสมรภูมิ\nสมัครใจ\nสมัยนิยม\nสมุทรศาสตร์\nสมุทรเสนา\nสยดสยอง\nสยองขวัญ\nสยามรัฐ\nสรรหา\nสรวมชีพ\nสรวลเส\nสร้อยเศร้า\nสร้างสรรค์\nสร้างเสริม\nสลดใจ\nสลบไสล\nสละสลวย\nสลาเหิน\nสลากภัต\nสวนครัว\nสวนป่า\nสวนสนุก\nสวนหย่อม\nส่วนกลาง\nส่วนเกิน\nส่วนตัว\nส่วนบุญ\nส่วนแบ่ง\nส่วนประกอบ\nส่วนพระองค์\nส่วนผสม\nส่วนรวม\nส่วนร่วม\nส่วนลด\nส่วนสัด\nสวมกอด\nสวมเขา\nสวมรอย\nสวยมภู\nสว่างไสว\nสวามิภักดิ์\nสวิงสวาย\nสสารนิยม\nส่อเสียด\nสอดคล้อง\nสอดแทรก\nสอดแนม\nสอบถาม\nสอบทาน\nสอบไล่\nสอบสวน\nส้อมเสียง\nสะสวย\nสะแกวัลย์\nสะแกแสง\nสะใจ\nสะเด็ดยาด\nสะเทือนใจ\nสะบัดช่อ\nสั่งสม\nส
ั่งสอน\nสั่งเสีย\nสังเกตการณ์\nสังคมนิยม\nสังคมวิทยา\nสังคมศาสตร์\nสังคมศึกษา\nสังคมสงเคราะห์\nสัญญาบัตร\nสัดส่วน\nสัตการ\nสัตบุรุษ\nสัตบริภัณฑ์\nสัตภัณฑ์\nสัตมหาสถาน\nสัตโลหะ\nสันเขา\nสันดอน\nสันหลัง\nสั่นเทา\nสั่นเทิ้ม\nสันติบาล\nสันติภาพ\nสันติวิธี\nสันติสุข\nสับเปลี่ยน\nสับสน\nสับหลีก\nสับหว่าง\nสัมมาคารวะ\nสัมมาชีพ\nส่าเหล้า\nสากกะเบือ\nสาทิสลักษณ์\nสาธุการ\nสาธุชน\nสาบเสือ\nสาปสรร\nสาปแช่ง\nสาปส่ง\nสามง่าม\nสามล้อ\nสามเหลี่ยม\nสามเวท\nสามัญชน\nสามัญสำนึก\nสายดิ่ง\nสายดิน\nสายตรวจ\nสายน้ำ\nสายบัว\nสายพาน\nสายฟ้า\nสายยาง\nสายยู\nสายใย\nสายรก\nสายรุ้ง\nสายล่อฟ้า\nสายลับ\nสายเลือด\nสายโลหิต\nสายวัด\nสายส่ง\nสายสวาท\nสายสะดือ\nสายสะพาย\nสายสัมพันธ์\nสายสิญจน์\nสายสืบ\nสายไหม\nสายอากาศ\nสายตา\nสายหยุด\nสารตรา\nสารประกอบ\nสารละลาย\nสารส้ม\nสารหนู\nสารทฤดู\nสาวใช้\nสาวน้อย\nสาวใหญ่\nสำนักงาน\nสำนักพิมพ์\nสำนักสงฆ์\nสำมะโนครัว\nสำเร็จรูป\nสิกขาบท\nสิงสถิต\nสิงสู่\nสิ่งก่อสร้าง\nสิ่งของ\nสิ่งปฏิกูล\nสิ่งพิมพ์\nสิ่งแวดล้อม\nสิ่งศักดิ์สิทธิ์\nสิทธิกร\nสิทธิ์ขาด\nสิทธิชัย\nสิทธิโชค\nสิทธิบัตร\nสินค้า\nสินจ้าง\nสินเชื่อ\nสินไถ่\nสินทรัพย์\nสินน้ำใจ\nสินบน\nสินแร่\nสินสมรส\nสินสอด\nสินไหม\nสิ้นเชิง\nสิ้นสุด\nสีผึ้ง\nสีลม\nสีชอล์ก\nสีถ่าน\nสีเทียน\nสีน้ำ\nสีน้ำมัน\nสีโปสเตอร์\nสีฝุ่น\nสี่เหลี่ยม\nสีหน้า\nสึกหรอ\nสืบทอด\nสืบค้น\nสืบสวน\nสืบสาว\nสืบเสาะ\nสื่อผสม\nสื่อมวลชน\nสื่อสาร\nสุกงอม\nสุกดิบ\nสุกปลั่ง\nสุกใส\nสุขนาฏกรรม\nสุขภัณฑ์\nสุขภาพ\nสุขลักษณะ\nสุขวิทยา\nสุขศาลา\nสุขศึกษา\nสุดท้าย\nสุตกวี\nสุนทรพจน์\nสุภาพชน\nสู่ขอ\nสู่รู้\nสู่สม\nสูงส่ง\nสูญเปล่า\nสูญสิ้น\nสูญเสีย\nสูญหาย\nเสสรวล\nเสแสร้ง\nเสกสรร\nเสถียรภาพ\nเส้นชัย\nเส้นตรง\nเส้นตาย\nเส้นทาง\nเส้นใย\nเส้นรุ้ง\nเส้นเลือด\nเส้นแวง\nเส้นสาย\nเส้นเสียง\nเส้นหมี่\nเส้นเอ็น\nเสบียงกรัง\nเสมอภาค\nเสมอหน้า\nเสมอเหมือน\nเสมียนตรา\nเสร็จสรรพ\nเสร็จสิ้น\nเสริมส่ง\nเสริมสร้าง\nเสริมสวย\nเสรีไทย\nเสรีธรรม\nเสรีนิยม\nเสรีภาพ\nเสาเข็ม\nเสาธง\nเสียใจ\nเสียเชิง\nเสียดาย\nเสียที\nเสียเที่ยว\nเสียเปรียบ\nเสียเปล่า\nเสียรู้\nเสียแรง\nเสียสละ\nเสียหลัก\nเสียหาย\nเสี่ยงทาย\nเสียดแทง\nเสียดแทรก\nเสียดสี\nเสี้ยนศึก\nเสี้ยนหนาม\nเสี้ยมสอน\
nเสียวซ่าน\nเสียวไส้\nเสือดาว\nเสือดำ\nเสือปลา\nเสือป่า\nเสือไฟ\nเสื่อกก\nเสื่อกระจูด\nเสื่อน้ำมัน\nเสื่อลำแพน\nเสื้อกล้าม\nเสื้อกั๊ก\nเสื้อเกราะ\nเสื้อครุย\nเสื้อแสง\nเสื้อเมือง\nเสือกคลาน\nเสือกสน\nเสือกไส\nเสื่อมคลาย\nเสื่อมถอย\nเสื่อมทราม\nเสื่อมโทรม\nเสื่อมสลาย\nเสื่อมสูญ\nเสื่อมเสีย\nเสือหมอบ\nแสกหน้า\nแสดงออก\nแสเถา\nแสนกล\nแสนรู้\nแสร้งว่า\nใส่ความ\nใส่ไคล้\nใส่ใจ\nใส่ไฟ\nไส้กรอก\nไส้ไก่\nไส้ติ่ง\nไส้ศึก\nไส้อั่ว\nไส้เดือน\nไส้ตัน\nไสยเวท\nไสยศาสตร์\nหกล้ม\nหงส์หยก\nหงอนไก่\nหงอยก๋อย\nหงอยเหงา\nหงายท้อง\nหงายหลัง\nหงำเหงอะ\nหงำเหงือก\nหดหาย\nหดหู่\nหนทาง\nหนวกหู\nหน่วงเหนี่ยว\nหน่วยก้าน\nหน่อไม้\nหนองใน\nหนองแซง\nหนักข้อ\nหนักใจ\nหนักแน่น\nหนักหน่วง\nหนักหนา\nหนังกลับ\nหนังตะลุง\nหนังเรียด\nหนังสด\nหนังใหญ่\nหนังสือพิมพ์\nหนาแน่น\nหน้ากระดาน\nหน้ากาก\nหน้ากาฬ\nหน้าแข้ง\nหน้าจั่ว\nหน้าฉาน\nหน้าตัก\nหน้าตา\nหน้าต่าง\nหน้าท้อง\nหน้าทับ\nหน้าที่\nหน้าที่นั่ง\nหน้าบัน\nหน้าปัด\nหน้าผา\nหน้าผาก\nหน้าม้า\nหน้ามุข\nหน้าไม้\nหน้าเลือด\nหน้าอก\nหนามเตย\nหน่ายหนี\nหน่ายแหนง\nหนาวเหน็บ\nหนำใจ\nหนี้สิน\nหนี้สูญ\nหนุนเนื่อง\nหนุนหลัง\nหมกมุ่น\nหมดจด\nหมอขวัญ\nหมอความ\nหมอแคน\nหมองู\nหมอดู\nหมอตำแย\nหมอทำขวัญ\nหมอนวด\nหมอผี\nหมอยา\nหมอลำ\nหมอเสน่ห์\nหม้อแกง\nหม้อตาล\nหม้อน้ำ\nหม้อแปลง\nหมองใจ\nหมองมัว\nหมองหม่น\nหมองหมาง\nหมอนขวาน\nหมอนข้าง\nหมอนทอง\nหม่อมเจ้า\nหม่อมฉัน\nหม่อมราชวงศ์\nหม่อมหลวง\nหม่อมห้าม\nหมั่นไส้\nหมาป่า\nหมาหมู่\nหมากฝรั่ง\nหมากสง\nหมากหอม\nหมากเก็บ\nหมากรุก\nหมากเม่า\nหมางใจ\nหมางเมิน\nหมาไม้\nหมายเกณฑ์\nหมายขัง\nหมายค้น\nหมายความ\nหมายจับ\nหมายใจ\nหมายตา\nหมายปล่อย\nหมายมั่น\nหมายเรียก\nหมายเลข\nหมายเหตุ\nหมิ่นเหม่\nหมึกจีน\nหมุนเวียน\nหมูแดง\nหมูป่า\nหมูแผ่น\nหมูยอ\nหมูหย็อง\nหมูหัน\nหมูแฮม\nหมู่บ้าน\nหยดย้อย\nหยอกเย้า\nหยักรั้ง\nหยักศก\nหยั่งทราบ\nหยั่งรู้\nหยั่งเสียง\nหยาบคาย\nหยาบช้า\nหยาบโลน\nหยาบหยาม\nหยิบมือ\nหยิบยก\nหยิบยืม\nหยิบหย่ง\nหยิบโหย่ง\nหริรักษ์\nหริวงศ์\nหลงผิด\nหลบฉาก\nหลบมุม\nหลวงจีน\nหลวงพ่อ\nหลวมตัว\nหล่อลื่น\nหล่อเลี้ยง\nหล่อหลอม\nหลอกลวง\nหลอกล่อ\nหลอกล้อ\nหลอดลม\nหลอดเลือด\nหลอดอาหาร\nหลอมตัว\nหลอมเหลว\nหลักการ\nหลักเกณฑ์\nหลั
กชัย\nหลักฐาน\nหลักทรัพย์\nหลักเมือง\nหลักลอย\nหลักสูตร\nหลักแหล่ง\nหลักแหลม\nหลังคา\nหลังเต่า\nหลั่งไหล\nหลับนก\nหลับใน\nหลากใจ\nหลากหลาย\nหลาบจำ\nหลายหลาก\nหลายแหล่\nหลุดพ้น\nหลุดลอย\nหลุดลุ่ย\nหลุมโจน\nหลุมพราง\nหวงก้าง\nหวงห้าม\nหวงแหน\nห่วงใย\nห้วงน้ำ\nหวังใจ\nหวังดี\nหวั่นกลัว\nหวั่นเกรง\nหวั่นใจ\nหวั่นวิตก\nหวั่นหวาด\nหวั่นไหว\nหวาดกลัว\nหวาดเกรง\nหวาดผวา\nหวาดเสียว\nหวาดหวั่น\nหวาดไหว\nหวานเย็น\nหว่านล้อม\nหอคอย\nหอคำ\nหอฉัน\nหอไตร\nหอประชุม\nหอพัก\nห่อหมก\nห่อเหี่ยว\nหอกซัด\nห้องเครื่อง\nห้องชุด\nห้องแถว\nห้องโถง\nห้องน้ำ\nห้องสมุด\nหอสมุด\nหอมหวน\nห้อมล้อม\nห้อยโหน\nหักล้าง\nหักหาญ\nหักห้าม\nหักเห\nหักโหม\nหักมุก\nหันเห\nหับเผย\nหัวขโมย\nหัวข้อ\nหัวขั้ว\nหัวเข่า\nหัวโขน\nหัวคะแนน\nหัวค่ำ\nหัวคิด\nหัวจุก\nหัวโจก\nหัวใจ\nหัวเทียน\nหัวนม\nหัวนอน\nหัวป่า\nหัวมุม\nหัวเรื่อง\nหัวแร้ง\nหัวใส\nหัวหน้า\nหัวหน่าว\nหัวหอก\nหัวเห็ด\nหัวไหล่\nหัวอก\nหัสดนตรี\nหัสนาฏกรรม\nหัสนิยาย\nหัสดีลิงค์\nหางเครื่อง\nหางแถว\nหางเลข\nหางว่าว\nหางเสียง\nหางเสือ\nห่างเหิน\nหาบเร่\nห้ามปราม\nห้ามล้อ\nหายตัว\nหาวนอน\nห้าวหาญ\nห้ำหั่น\nหินงอก\nหินทราย\nหินปูน\nหินย้อย\nหินอ่อน\nหินชาติ\nหินยาน\nหีบเพลง\nหีบห่อ\nหุ่นกระบอก\nหุ่นยนต์\nหุ้นลม\nหุ้นส่วน\nหุบเขา\nหุบผา\nหุบเหว\nหูกระต่าย\nหูช้าง\nหูรูด\nหูกวาง\nเหงาหงอย\nเหงื่อกาฬ\nเหตุการณ์\nเหตุผล\nเห็นแก่\nเห็นใจ\nเหน็บแนม\nเหน็บชา\nเหนียวแน่น\nเหนี่ยวนำ\nเหนี่ยวรั้ง\nเหนื่อยหน่าย\nเหมาะเจาะ\nเหมาะสม\nเหมาะเหม็ง\nเหยเก\nเหยียดหยาม\nเหล็กกล้า\nเหล็กจาร\nเหล็กใน\nเหล็กส่ง\nเหล็กเส้น\nเหล็กหล่อ\nเหล็กไหล\nเหลวแหลก\nเหลวไหล\nเหลอหลา\nเหล่ากอ\nเหลียวแล\nเหลือเกิน\nเหลือขอ\nเหลือใจ\nเหลือเชื่อ\nเหลือเฟือ\nเหลือร้าย\nเหลือล้น\nเหลือหลาย\nเหลือแหล่\nเหลือแสน\nเหลือหลอ\nเหลื่อมล้ำ\nเห่อเหิม\nเหินห่าง\nเหิมเกริม\nเหิมหาญ\nเหี้ยมเกรียม\nเหี้ยมหาญ\nเหี้ยมโหด\nเหี่ยวแห้ง\nเหือดหาย\nเหือดแห้ง\nแห่แหน\nแหนงหน่าย\nแหลกลาญ\nแหลกเหลว\nแหวกแนว\nแหวกว่าย\nโหงพราย\nโหดร้าย\nโหดเหี้ยม\nโหยหวน\nโหวงเหวง\nให้การ\nให้ท่า\nให้ท้าย\nให้ร้าย\nให้หลัง\nไหมพรม\nไหวพริบ\nอกไก่\nอกร่อง\nองค์กร\nองค์การ\nอดกลั้น\nอดทน\nอดสู\nอดอยาก\nอดออม\nอดีตกาล\nอดีตชาติ\nอดีตภพ\nอติชา
ตบุตร\nอธิการบดี\nอนาคตกาล\nอนิจกรรม\nอนุชาตบุตร\nอเนกประสงค์\nอบรม\nอบอวล\nอบอ้าว\nอบอุ่น\nอบายภูมิ\nอบายมุข\nอภัพบุคคล\nอภัยทาน\nอภัยโทษ\nอภิชาตบุตร\nอมยิ้ม\nอมรรัตน์\nอมฤตบท\nอมฤตรส\nอย่างไร\nอรรถกร\nอรรถกวี\nอรรถคดี\nอรรถประโยชน์\nอรรถรส\nอรรธนิศา\nอรรธภาค\nอรรธสระ\nอรสุมพล\nอรูปฌาน\nอรูปพรหม\nอรูปภพ\nอรูปภูมิ\nอวชาตบุตร\nอวดดี\nอวดอ้าง\nอ้วนท้วน\nอ้วนพี\nอวบอั๋น\nอวยชัย\nอวยพร\nอสุภกรรมฐาน\nอสุภสัญญา\nอโหสิกรรม\nออเจ้า\nออกแขก\nออกตัว\nออกโรง\nออกฤทธิ์\nออกลาย\nออกหาก\nออดอ้อน\nออดแอด\nอ่อนข้อ\nอ่อนใจ\nอ่อนช้อย\nอ่อนน้อม\nอ่อนเปลี้ย\nอ่อนเพลีย\nอ่อนโยน\nอ่อนหวาน\nอ่อนหัด\nอ่อนไหว\nอ่อนแอ\nอ้อนวอน\nอ้อนออด\nอ้อมค้อม\nอักษรศาสตร์\nอักษรสาส์น\nอัคคีภัย\nอัญชนะศักราช\nอัดฉีด\nอัดอั้น\nอัตราส่วน\nอันโตชน\nอันโตนาที\nอับจน\nอับเฉา\nอับอาย\nอัสสุชล\nอัสสุธารา\nอากัปกิริยา\nอาการนาม\nอากาศธาตุ\nอากาศยาน\nอาคารชุด\nอ่างเก็บน้ำ\nอ้างอิง\nอาจหาญ\nอาจอง\nอาชญากร\nอาชญากรรม\nอาชญาบัตร\nอาชญาสิทธิ์\nอาญาสิทธิ์\nอาณาเขต\nอาณาจักร\nอาณานิคม\nอาณาประโยชน์\nอาโปกสิณ\nอาโปธาตุ\nอาภากร\nอายุขัย\nอายุวัฒนะ\nอาโลกกสิณ\nอาหารว่าง\nอำพราง\nอิดโรย\nอิดออด\nอิดเอื้อน\nอิตถีลิงค์\nอิทธิปาฏิหาริย์\nอิทธิพล\nอิทธิฤทธิ์\nอินังขังขอบ\nอิ่มตัว\nอิ่มหนำ\nอิ่มเอม\nอิ่มเอิบ\nอีฉัน\nอีตัว\nอึงคะนึง\nอึงมี่\nอึงอล\nอึ่งยาง\nอึ่งอ่าง\nอึดใจ\nอึดอัด\nอืดอาด\nอื้อฉาว\nอื้อซ่า\nอื้ออึง\nอุกฉกรรจ์\nอุกอาจ\nอุดอู้\nอุ่นเครื่อง\nอุ่นใจ\nอุบอิบ\nอุบัติภัย\nอุบัติเหตุ\nอุโบสถกรรม\nอุโบสถหัตถี\nอุปมาโวหาร\nอุ้มชู\nอุ้มสม\nอุ้ยอ้าย\nอู้อี้\nเอกจิต\nเอกฉันท์\nเอกชน\nเอกเทศ\nเอกนัย\nเอกบุคคล\nเอกบุรุษ\nเอกพจน์\nเอกภพ\nเอกภาพ\nเอกมัย\nเอกราช\nเอกรูป\nเอกลักษณ์\nเอกศก\nเอกสาร\nเอกสิทธิ์\nเอกอุ\nเอ็ดอึง\nเอนเอียง\nเอมอร\nเอออวย\nเออออ\nเอาการ\nเอางาน\nเอาจริง\nเอาใจ\nเอาเปรียบ\nเอาเยี่ยง\nเอิบอาบ\nเอียงอาย\nเอียงเอน\nเอื้อเฟื้อ\nโอ่โถง\nโอ้โถง\nโอ่อวด\nโอ้อวด\nโอ่อ่า\nโอ้โลม\nโอดครวญ\nโอดโอย\nโอนอ่อน\nโอนเอน\nโอบอ้อม\nโอบอุ้ม\nโอสถกรรม\nไอเสีย\nไอกรน\nฮวบฮาบ\nฮาป่า\nฮึกหาญ\nฮึกห้าว\nฮึกเหิม\nฮึกโหม\nฮึกฮัก\nเฮงซวย\nโฮกฮือ\nโฮกฮาก\n\nก็\nกก\nก๊ก\nกกุธภัณฑ์\nกง\nก่ง\nก้ง\nก๊ง\nก๋ง\nกงกอน\nกงไฉ่\nกงเต๊ก\nกงสี\nกงสุล
\nกช\nกฎ\nกฏุก\nกฐิน\nกณิกนันต์\nกณิการ์\nกด\nกตเวทิตา\nกตเวที\nกตัญชลี\nกตัญญุตา\nกตัญญู\nกตาธิการ\nกตาภินิหาร\nกติกา\nกถา\nกถิกาจารย์\nกทลี\nกน\nก่น\nก้น\nกนก\nกนิษฐ์\nกนิษฐา\nกบ\nกบฏ\nกบดาน\nกบทู\nกบาล\nกบินทร์\nกบิล\nกบี่\nกบูร\nกเบนทร์\nกม\nก้ม\nกมณฑลาภิเษก\nกมณฑโลทก\nกมล\nกมลา\nกมลาศ\nกมลาสน์\nกมเลศ\nกมัณฑลุ\nกมุท\nกร\nกรกฎ\nกรกฎาคม\nกรกฏ\nกรง\nกรชกาย\nกรณฑ์\nกรณิการ์\nกรณี\nกรณีย์\nกรณียกิจ\nกรณียะ\nกรด\nกรน\nกรบ\nกรบูร\nกรพินธุ์\nกรม\nกรรกฎ\nกรรกศ\nกรรเกด\nกรรไกร\nกรรเจียก\nกรรชิง\nกรรเชียง\nกรรโชก\nกรรฐ์\nกรรฐา\nกรรณ\nกรรณา\nกรรณิกา\nกรรณิการ์\nกรรดิ\nกรรดิก\nกรรดึก\nกรรตุ\nกรรไตร\nกรรทบ\nกรรแทก\nกรรบิด\nกรรบูร\nกรรภิรมย์\nกรรม\nกรรม์\nกรรม์ภิรมย์\nกรรมชวาต\nกรรมัชวาต\nกรรมาชีพ\nกรรมาธิการ\nกรรมาร\nกรรษก\nกรรสะ\nกรรแสง\nกรวด\nกรวบ\nกรวม\nกร้วม\nกรวย\nกรวิก\nกรสาปน์\nกรสุทธิ์\nกรอ\nกร้อ\nกรอก\nกร็อกกร๋อย\nกรอกแกรก\nกรอง\nกรองกรอย\nกรอด\nกร่อน\nกรอบ\nกรอม\nกร่อย\nกระ\nกระกร\nกระกรุ่น\nกระกลับกลอก\nกระกี้\nกระเกรอก\nกระเกริก\nกระเกริ่น\nกระคน\nกระคาย\nกระงกกระเงิ่น\nกระง่องกระแง่ง\nกระง่อนกระแง่น\nกระเง้ากระงอด\nกระโงก\nกระจก\nกระจง\nกระจร\nกระจอก\nกระจองหง่อง\nกระจ๋องหง่อง\nกระจองอแง\nกระจ้อน\nกระจอนหู\nกระจ้อย\nกระจ๋อหวอ\nกระจะ\nกระจัก\nกระจัง\nกระจัด\nกระจับ\nกระจ่า\nกระจ่าง\nกระจาด\nกระจาน\nกระจาบ\nกระจาม\nกระจาย\nกระจาว\nกระจิก\nกระจิ๋ง\nกระจิด\nกระจิบ\nกระจิ๋ม\nกระจิริด\nกระจิ๋ว\nกระจี้\nกระจี๋\nกระจุก\nกระจุ๋งกระจิ๋ง\nกระจุบ\nกระจุ๊บ\nกระจุ๋มกระจิ๋ม\nกระจุย\nกระจู้\nกระจู๋กระจี๋\nกระจูด\nกระเจอะกระเจิง\nกระเจา\nกระเจ่า\nกระเจ้า\nกระเจาะ\nกระเจิง\nกระเจิดกระเจิง\nกระเจี้ยง\nกระเจี๊ยบ\nกระเจียว\nกระเจี๊ยว\nกระแจะ\nกระโจน\nกระโจม\nกระฉอก\nกระฉ่อน\nกระฉับกระเฉง\nกระฉิ่ง\nกระฉีก\nกระฉูด\nกระเฉก\nกระเฉด\nกระแฉก\nกระโฉกกระเฉก\nกระโฉม\nกระชดกระช้อย\nกระชอน\nกระชอมดอก\nกระช้อย\nกระชัง\nกระชั้น\nกระชับ\nกระชาก\nกระชาย\nกระชิง\nกระชิด\nกระชุ\nกระชุก\nกระชุ่มกระชวย\nกระเชอ\nกระเชา\nกระเช้า\nกระเชียง\nกระแชง\nกระแชะ\nกระโชก\nกระซ่องกระแซ่ง\nกระซับ\nกระซาบ\nกระซิก\nกระซิบ\nกระซี้\nกระซุง\nกระซุบกระซิบ\nกระซุ้ม\nกระซู่\nกระเซ็น\nกระเซอ\nกระเซอะกระเซอ\nกระเซอะกระเซิง\n
กระเซ้า\nกระเซิง\nกระแซ\nกระแซะ\nกระโซกระเซ\nกระฎี\nกระฎุมพี\nกระดก\nกระด้ง\nกระดนโด่\nกระดวง\nกระดวน\nกระด้วมกระเดี้ยม\nกระดอ\nกระดอง\nกระดองหาย\nกระดอน\nกระดอม\nกระดักกระเดี้ย\nกระดังงัว\nกระดังงา\nกระดาก\nกระด้าง\nกระดางลาง\nกระดาด\nกระดาดขาว\nกระดาน\nกระดานพน\nกระดาษ\nกระดำกระด่าง\nกระดิก\nกระดิ่ง\nกระดิ้ง\nกระดิบ\nกระดี่\nกระดี้กระเดียม\nกระดึง\nกระดืบ\nกระดุ\nกระดุกกระดิก\nกระดุ้งกระดิ้ง\nกระดุบ\nกระดุบกระดิบ\nกระดุม\nกระดูก\nกระเดก\nกระเด้ง\nกระเด็น\nกระเด้า\nกระเดาะ\nกระเดิด\nกระเดี้ย\nกระเดียด\nกระเดือก\nกระเดื่อง\nกระแด็ก\nกระแด้ง\nกระแด้แร่\nกระแด่ว\nกระแดะ\nกระโดก\nกระโดง\nกระโดด\nกระโดน\nกระได\nกระตรับ\nกระตราก\nกระตรุด\nกระตรุม\nกระต้วมกระเตี้ยม\nกระต้อ\nกระต่องกระแต่ง\nกระต๊อบ\nกระต้อยตีวิด\nกระตัก\nกระตั้ว\nกระต่าย\nกระติก\nกระติ๊ด\nกระติบ\nกระตือรือร้น\nกระตุก\nกระตุ้งกระติ้ง\nกระตุ่น\nกระตุ้น\nกระตูบ\nกระเตง\nกระเต็น\nกระเตอะ\nกระเตาะ\nกระเตาะกระแตะ\nกระเตื้อง\nกระแต\nกระแตแต้แว้ด\nกระโตกกระตาก\nกระโตน\nกระถด\nกระถอบ\nกระถั่ว\nกระถาง\nกระถิก\nกระถิน\nกระเถิบ\nกระโถน\nกระทก\nกระทง\nกระทบ\nกระทรวง\nกระทอก\nกระท้อน\nกระท่อนกระแท่น\nกระท่อม\nกระท้อมกระแท้ม\nกระทะ\nกระทั่ง\nกระทั้น\nกระทา\nกระทาย\nกระทาสี\nกระทาหอง\nกระทำ\nกระทิกกระทวย\nกระทิง\nกระทึง\nกระทืบ\nกระทุ\nกระทุง\nกระทุ้ง\nกระทุ่ม\nกระทู้\nกระเท่\nกระเทียบ\nกระเทียม\nกระเทือน\nกระเทื้อม\nกระแทก\nกระแท่น\nกระแทะ\nกระไทชาย\nกระน่อง\nกระนั้น\nกระนี้\nกระแนะกระแหน\nกระโน้น\nกระไน\nกระบก\nกระบม\nกระบวน\nกระบวย\nกระบวร\nกระบอก\nกระบอง\nกระบะ\nกระบัด\nกระบั้วกระเบี้ย\nกระบ่า\nกระบ้า\nกระบาก\nกระบาย\nกระบาล\nกระบิ\nกระบิ้ง\nกระบิด\nกระบิล\nกระบี่\nกระบือ\nกระบุง\nกระบุ่มกระบ่าม\nกระบู้กระบี้\nกระบูน\nกระบูร\nกระเบง\nกระเบญ\nกระเบ็ดกระบวน\nกระเบน\nกระเบา\nกระเบิก\nกระเบียด\nกระเบียน\nกระเบื้อง\nกระแบก\nกระแบะ\nกระโบม\nกระปมกระปำ\nกระปมกระเปา\nกระปรอก\nกระปรอกว่าว\nกระปรี้กระเปร่า\nกระปอก\nกระป้อกระแป้\nกระป่อง\nกระป๋อง\nกระปอดกระแปด\nกระป๋อหลอ\nกระปั้วกระเปี้ย\nกระป่ำ\nกระปุก\nกระปุ๊กลุก\nกระปุ่ม\nกระปุ่มกระป่ำ\nกระปุ่มกระปิ่ม\nกระเป๋า\nกระเปาะ\nกระโปก\nกระโปรง\nกระผม\nกระผลีกระผลาม\nกระผาน\nกระผีก\nก
ระพรวน\nกระพริ้ม\nกระพอก\nกระพอง\nกระพ้อม\nกระพัก\nกระพัง\nกระพังเหิร\nกระพังโหม\nกระพัด\nกระพัตร\nกระพัน\nกระพั่น\nกระพา\nกระพาก\nกระพี้\nกระพือ\nกระพุ้ง\nกระพุ่ม\nกระเพาะ\nกระเพิง\nกระเพื่อม\nกระแพ้ง\nกระฟัดกระเฟียด\nกระฟูมกระฟาย\nกระมล\nกระมอบ\nกระมอมกระแมม\nกระมัง\nกระมัน\nกระมิดกระเมี้ยน\nกระมุท\nกระเมาะ\nกระย่อง\nกระย่องกระแย่ง\nกระย่อน\nกระย่อม\nกระยา\nกระยาง\nกระยาจก\nกระยาหงัน\nกระยิก\nกระยิ้มกระย่อง\nกระยึกกระยือ\nกระยืดกระยาด\nกระเย้อกระแหย่ง\nกระรอก\nกระเรียน\nกระโรกน้ำข้าว\nกระโรกใหญ่\nกระไร\nกระลด\nกระลบ\nกระลอก\nกระลอม\nกระละหล่ำ\nกระลัด\nกระลับ\nกระลัมพร\nกระลา\nกระลำ\nกระลำพัก\nกระลำพุก\nกระลิง\nกระลี\nกระลุมพาง\nกระลุมพุก\nกระลุมพู\nกระลูน\nกระลู่น์\nกระเล็น\nกระเลียด\nกระเลือก\nกระโลง\nกระวน\nกระวัด\nกระวาด\nกระวาน\nกระวาย\nกระวิน\nกระวี\nกระวีกระวาด\nกระวูดกระวาด\nกระเวน\nกระเวยกระวาย\nกระแวน\nกระโวยกระวาย\nกระษัย\nกระษาปณ์\nกระสง\nกระสบ\nกระสม\nกระสรวล\nกระสร้อย\nกระสวน\nกระสวย\nกระสอบ\nกระสะ\nกระสัง\nกระสัน\nกระสับกระส่าย\nกระสา\nกระสานติ์\nกระสาบ\nกระสาย\nกระสือ\nกระสุงกระสิง\nกระสุน\nกระสูทธิ์\nกระสูบ\nกระเสด\nกระเส็นกระสาย\nกระเส่า\nกระเสาะกระแสะ\nกระเสียน\nกระเสียร\nกระเสือกกระสน\nกระแส\nกระแสง\nกระแสะ\nกระโสง\nกระไส\nกระหนก\nกระหนาก\nกระหนาบ\nกระหน่ำ\nกระหมวด\nกระหมอบ\nกระหม่อม\nกระหมั่ง\nกระหมิบ\nกระหมุดกระหมิด\nกระหมุบ\nกระหย่ง\nกระหย่อม\nกระหยัง\nกระหยับ\nกระหยิ่ม\nกระหรอด\nกระหริ่ง\nกระหวน\nกระหวัด\nกระหอง\nกระหัง\nกระหัด\nกระหาง\nกระหาย\nกระหึม\nกระหึ่ม\nกระหืดกระหอบ\nกระเห็น\nกระเหนียด\nกระเหม็ดกระเหมียด\nกระเหม็ดกระแหม่\nกระเหม่น\nกระเหม่า\nกระเหว่า\nกระเห่อ\nกระเหิม\nกระเหี้ยนกระหือรือ\nกระแห\nกระแหทอง\nกระแหนบ\nกระแหนะ\nกระแหมบ\nกระแหม่ว\nกระแหย่ง\nกระแหร่ม\nกระแหล่ง\nกระโห้\nกระโหนด\nกระโหม\nกระโหย\nกระโหย่ง\nกระอวล\nกระอ้อกระแอ้\nกระออดกระแอด\nกระออบ\nกระออม\nกระอ้อมกระแอ้ม\nกระอัก\nกระอักกระอ่วน\nกระอั้วแทงควาย\nกระอ้า\nกระอาน\nกระอิด\nกระอิดกระเอื้อน\nกระอึก\nกระอืด\nกระอุ\nกระอุก\nกระเอา\nกระเอิก\nกระเอิบ\nกระแอก\nกระแอบ\nกระแอม\nกระไอ\nกรัก\nกรักขี\nกรัง\nกรัชกาย\nกรัณฑ์\nกรัณย์\nกรัน\nกรับ\nกรัม\nกราก\nกราง\nกร่าง\nกราด\nกร
าดวง\nกราน\nกร้าน\nกราบ\nกราฟ\nกราม\nกราย\nกร่าย\nกราว\nกร้าว\nกรำ\nกร่ำ\nกริก\nกริ๊ก\nกริกกริว\nกริกกรี\nกริ่ง\nกริ๊ง\nกริงกริว\nกริ้งกริ้ว\nกริช\nกริณี\nกริน\nกรินทร์\nกรินี\nกริบ\nกริม\nกริ่ม\nกริยา\nกริยานุเคราะห์\nกริว\nกริ้ว\nกรี\nกรีฑา\nกรีด\nกรี๊ด\nกรีธา\nกรีษ\nกรีส\nกรึ๊บ\nกรุ\nกรุง\nกรุ้งกริ่ง\nกรุณ\nกรุณา\nกรุณาธิคุณ\nกรุ่น\nกรุบ\nกรุ่ม\nกรุ้มกริ่ม\nกรุย\nกรุยเกรียว\nกรู\nกรูด\nกรูม\nกเรณุ\nกเรนทร\nกเรนทร์\nกฤช\nกฤดาภินิหาร\nกฤตติกา\nกฤษฎา\nกฤษฎาธาร\nกฤษฎาภินิหาร\nกฤษฎีกา\nกฤษณา\nกล\nกลด\nกล่น\nกลบ\nกลม\nกลละ\nกลวง\nกล้วย\nกลศ\nกล้อ\nกลอก\nกลอง\nกล่อง\nกล้อง\nกล้องแกล้ง\nกลอน\nกล่อน\nกล้อน\nกล่อม\nกล้อมแกล้ม\nกลอย\nกลัก\nกลัด\nกลั่น\nกลั้น\nกลันทก์\nกลันทะ\nกลับ\nกลัมพร\nกลัมพัก\nกลัว\nกลั้ว\nกลา\nกล้า\nกลาก\nกลากลาด\nกลาง\nกลาด\nกลาบาต\nกลาป\nกล้าม\nกลาย\nกล้าย\nกลายกลอก\nกล่าว\nกลาโหม\nกล่ำ\nกล้ำ\nกลิ้ง\nกลิงค์\nกลิ่น\nกลี\nกลีบ\nกลึง\nกลึงค์\nกลืน\nกลุ่ม\nกลุ้ม\nกลูโคส\nกเลวระ\nกวด\nกวน\nกวม\nกวย\nกวยจั๊บ\nกวยจี๊\nก๋วยเตี๋ยว\nกวัก\nกวัด\nกวา\nกว่า\nกวาง\nกว่าง\nกว้าง\nกว่างโซ้ง\nกวางตุ้ง\nกวาด\nกว้าน\nกว๊าน\nกว้าว\nกวาวเครือ\nกวี\nกษณะ\nกษมา\nกษัตร\nกษัตรา\nกษัตริย์\nกษัตรี\nกษัตรีย์\nกษัย\nกษาปณ์\nกษิดิ\nกษีร\nกษีรธารา\nกษีระ\nกสานติ์\nกสิกร\nกสิกรรม\nกสิณ\nกหังปายา\nกหาปณะ\nกเฬวราก\nกอ\nก่อ\nก้อ\nก๊อ\nกอก\nก๊อก\nกอแก\nกอง\nก่อง\nก้อง\nกองกอย\nก๊อซ\nกอด\nก่อน\nก้อน\nกอบ\nกอบนาง\nก๊อบปี้\nกอปร\nก้อม\nกอมก้อ\nก่อมก้อ\nกอย\nก้อย\nก๋อย\nกอริลลา\nกอล์ฟ\nกอและ\nกอเอี๊ยะ\nกะ\nกะกัง\nกะง้องกะแง้ง\nกะจัง\nกะแจะ\nกะชะ\nกะชัง\nกะชามาศ\nกะชิง\nกะชึ่กกะชั่ก\nกะแช่\nกะซวก\nกะซ้าหอย\nกะซี่\nกะโซ่\nกะโซ้\nกะดก\nกะดง\nกะดวน\nกะดอก\nกะดะ\nกะดังบาย\nกะดัด\nกะด้าง\nกะดำกะด่าง\nกะดี\nกะดี่\nกะดุ้ง\nกะเด้\nกะเดก\nกะเดี๋ยว\nกะตรุด\nกะตอก\nกะต่อย\nกะตัก\nกะตั้ก\nกะตัง\nกะตังกะติ้ว\nกะต๊าก\nกะต้ำ\nกะติ๊กริก\nกะติงกะแตง\nกะตีบ\nกะตึงกะแตง\nกะตุ๊ก\nกะตุด\nกะตูก\nกะเตง\nกะโต๊ก\nกะโตงกะเตง\nกะโต้งโห่ง\nกะถัว\nกะทกรก\nกะทอ\nกะทัง\nกะทังหัน\nกะทัดรัด\nกะทันหัน\nกะทับ\nกะทิ\nกะทือ\nกะทุน\nกะเทย\nกะเทาะ\nกะแท้\nกะแท่ง\nกะแทน\nกะนวล\nกะนัด\nกะบ่อนกะแบ่น\nกะบัง\nกะบั้ง\nกะบิ้ง\nกะบิล\n
กะบึงกะบอน\nกะบุด\nกะเบ้อ\nกะเบียน\nกะเบือ\nกะปริดกะปรอย\nกะปริบ\nกะปริบกะปรอย\nกะปลกกะเปลี้ย\nกะปวกกะเปียก\nกะปอม\nกะปอมขาง\nกะปะ\nกะป้ำกะเป๋อ\nกะปิ\nกะปู\nกะปูด\nกะปูดหลูด\nกะเปะ\nกะเปิ๊บกะป๊าบ\nกะเปียด\nกะแป้น\nกะแปะ\nกะโปรง\nกะโปโล\nกะผลุบกะโผล่\nกะเผ่น\nกะเผลก\nกะโผลกกะเผลก\nกะพง\nกะพรวดกะพราด\nกะพร่องกะแพร่ง\nกะพริบ\nกะพรุน\nกะพรูดกะพราด\nกะพล้อ\nกะพ้อ\nกะเพรา\nกะเพียด\nกะเม็ง\nกะร่องกะแร่ง\nกะระตะ\nกะระหนะ\nกะรัง\nกะรัต\nกะราง\nกะริง\nกะรุงกะรัง\nกะรุ่งกะริ่ง\nกะรุน\nกะเร\nกะเรกะร่อน\nกะเร่กะร่อน\nกะเร่อ\nกะเรี่ยกะราด\nกะโรกะเร\nกะลวย\nกะลอ\nกะล่อกะแล่\nกะลอจี๊\nกะล่อน\nกะล่อมกะแล่ม\nกะล่อยกะหลิบ\nกะละปังหา\nกะละมัง\nกะละแม\nกะละออม\nกะลังตังไก่\nกะลัน\nกะลันทา\nกะลา\nกะลาง\nกะลาสี\nกะลำพอ\nกะลิง\nกะลิงปลิง\nกะลิ้มกะเหลี่ย\nกะลิอ่อง\nกะลุมพี\nกะเล็ง\nกะเล่อกะล่า\nกะเลิด\nกะเลียว\nกะแล\nกะโล่\nกะโลง\nกะวอกกะแวก\nกะวะ\nกะส้มชื่น\nกะสัง\nกะส้าหอย\nกะหนอกะแหน\nกะหน็องกะแหน็ง\nกะหนะ\nกะหนุงกะหนิง\nกะหร่อง\nกะหรอด\nกะหร็อมกะแหร็ม\nกะหราน\nกะหรี่\nกะหรี่ปั๊บ\nกะหลาป๋า\nกะหล่ำ\nกะหลี่\nกะหลีกะหลอ\nกะหลุกกะหลิก\nกะหำ\nกะหำแพะ\nกะหือ\nกะหูด\nกะเหรี่ยง\nกะเหลาะเปาะ\nกะแหยก\nกะแหะ\nกะโหลก\nกะโหล้ง\nกะไหล่\nกะอวม\nกะออม\nกะอาน\nกะอาม\nกะอูบ\nกัก\nกั๊ก\nกักกรา\nกักการุ\nกักขฬะ\nกัง\nกั้ง\nกังก้า\nกังเกียง\nกังขา\nกังฉิน\nกังฟู\nกังวล\nกังวาน\nกังสดาล\nกังไส\nกังหัน\nกัจฉปะ\nกัจฉะ\nกัจฉา\nกัญ\nกัญจุก\nกัญจุการา\nกัญชา\nกัญญา\nกัฐ\nกัณฏกะ\nกัณฐกะ\nกัณฐชะ\nกัณฐัศ\nกัณฐัศว์\nกัณฐา\nกัณฐี\nกัณฑ์\nกัณณ์\nกัณหา\nกัด\nกัตติกมาส\nกัตติกา\nกัตติเกยา\nกัตรา\nกัทลี\nกัน\nกั่น\nกั้น\nกันเกรา\nกันไกร\nกันชิง\nกันเชอ\nกันดาร\nกันดาล\nกันได\nกันต์\nกันตัง\nกันไตร\nกันทร\nกันทรากร\nกันภิรมย์\nกันเมียง\nกันย์\nกันยา\nกันยายน\nกันลง\nกันลอง\nกันแสง\nกั้นหยั่น\nกับ\nกับแก้\nกัป\nกัปตัน\nกัปปาสิก\nกัปปิยภัณฑ์\nกัปปิยะ\nกัมปนาท\nกัมประโด\nกัมปี\nกัมพล\nกัมพุช\nกัมพู\nกัมพูชา\nกัมโพช\nกัมมัชวาต\nกัมมัฏฐาน\nกัมมันตภาพรังสี\nกัมมันตรังสี\nกัมมาร\nกัมลาศ\nกัยวิกัย\nกัลชาญ\nกัลบก\nกัลป์\nกัลปนา\nกัลปพฤกษ์\nกัลปังหา\nกัลปาวสาน\nกัลปิต\nกัลเม็ด\nกัลยา\nกัลยาณมิตร\nกัลยาณี\nกัลออม\nกัศยป\nกัษณ\nกา\nก๋า
\nกาก\nกากบาท\nกากะทิง\nกากะเยีย\nกากี\nกาง\nก้าง\nกางเกง\nกางเขน\nก๊าซ\nกาซะลอง\nกาญจนา\nกาฐ\nกาด\nก๊าด\nกาน\nก่าน\nก้าน\nก๊าน\nกานดา\nกานต์\nกานน\nก้านพร้าว\nกานพลู\nกาน้า\nกาบ\nก้าบ\nกาบู\nกาพย์\nกาเฟอีน\nกาแฟ\nกาม\nก้าม\nกามารมณ์\nกามินี\nกาเมสุมิจฉาจาร\nกาย\nก่าย\nกาเยน\nการ\nการณ์\nการ์ด\nการ์ตูน\nการบูร\nการย์\nการวิก\nการเวก\nการะเกด\nการะบุหนิง\nการัณย์\nการันต์\nการางหัวขวาน\nการิตการก\nการิตวาจก\nการุญ\nการุณย์\nกาเรการ่อน\nกาล\nกาลกรรณี\nกาลกิณี\nกาลจักร\nกาลัญญุตา\nกาลัญญู\nกาลัด\nกาลานุกาล\nกาลิก\nกาลี\nกาแล\nกาแล็กซี\nกาแล็กโทส\nกาว\nก้าว\nกาววาว\nกาวาง\nกาแวน\nกาศิก\nกาษฐะ\nกาษา\nกาสร\nกาสะ\nกาสา\nกาสาร\nกาสาวะ\nกาสิโน\nกาหล\nกาหลง\nกาหลา\nกาเหว่า\nกาไหล่\nกาฬ\nกาฬาวก\nกาฮัง\nกำ\nก่ำ\nกำกวม\nกำกัด\nกำกับ\nก้ำกึ่ง\nกำกูน\nก้ำเกิน\nกำเกียง\nกำคูน\nกำจร\nกำจัด\nกำจาย\nกำชับ\nกำชำ\nกำซาบ\nกำซำ\nกำด้น\nกำดัด\nกำดาล\nกำเดา\nกำธร\nกำนล\nกำนัน\nกำนัล\nกำเนิด\nกำบัง\nก่ำบึ้ง\nกำเบ้อ\nกำปอ\nกำปั่น\nกำผลา\nกำพง\nกำพด\nกำพต\nกำพร้า\nกำพราก\nกำพวด\nกำพอง\nกำพืด\nกำพุด\nกำพู\nกำเพลิง\nกำแพง\nกำภู\nกำมลาศน์\nกำมเลศ\nกำมะถัน\nกำมะลอ\nกำมะหยี่\nกำมะหริด\nกำมังละการ\nกำมังวิลิต\nกำมัชพล\nกำยาน\nกำยำ\nกำรอ\nกำราบ\nกำราล\nกำเริบ\nกำไร\nกำลัง\nกำลุง\nกำเลา\nกำไล\nกำสรด\nกำสรวล\nกำหนด\nกำหนัด\nกำเหน็จ\nกำแหง\nกิก\nกิ๊ก\nกิ่ง\nกิ้งก่า\nกิ้งกือ\nกิ้งโครง\nกิจ\nกิจจะ\nกิจจา\nกิดาการ\nกิดาหยัน\nกิตติ\nกิตติมศักดิ์\nกิน\nกินนร\nกินปลี\nกินเปี้ยว\nกินริน\nกินรี\nกิ๊บ\nกิมตึ๋ง\nกิมิชาติ\nกิมิวิทยา\nกิโมโน\nกิโยตีน\nกิระ\nกิริณี\nกิรินท\nกิริเนศวร\nกิริยา\nกิเลน\nกิเลส\nกิโล\nกิโลมกะ\nกิ่ว\nกิ๋ว\nกี\nกี่\nกี้\nกี๊\nกี๋\nกีฏวิทยา\nกีด\nกีตาร์\nกีบ\nกีรติ\nกีฬา\nกึก\nกึง\nกึ่ง\nกึ๋น\nกุ\nกุก\nกุ๊ก\nกุกกุฏ\nกุกกุร\nกุกรรม\nกุ้ง\nกุงอน\nกุงาน\nกุโงก\nกุจี\nกุญแจ\nกุญชร\nกุฎ\nกุฎา\nกุฎี\nกุฎุมพี\nกุฏฐัง\nกุฏิ\nกุณฑ์\nกุณฑล\nกุณฑี\nกุณโฑ\nกุณาล\nกุณี\nกุด\nกุดัง\nกุดั่น\nกุดา\nกุทัณฑ์\nกุน\nกุ๊น\nกุนเชียง\nกุนที\nกุโนกามอ\nกุบ\nกุบกับ\nกุม\nกุ่ม\nกุมฝอย\nกุมภ์\nกุมภนิยา\nกุมภัณฑ์\nกุมภา\nกุมภิล\nกุมภีล์\nกุมเหง\nกุมาร\nกุมารา\nกุมารี\nกุมุท\nกุย\nกุ๊ย\nกุ๋ย\nกุยช่าย\nกุยเฮง\nกุรระ\nกุร
ุพินท์\nกุเรา\nกุล\nกุลา\nกุลาหล\nกุลี\nกุลีกุจอ\nกุเลา\nกุแล\nกุเวร\nกุศราช\nกุศล\nกุศโลบาย\nกุสุม\nกุสุมภ์\nกุสุมา\nกุสุมาลย์\nกุสุมิตลดาเวลลิตา\nกุหนี\nกุหนุง\nกุหร่า\nกุหล่า\nกุหลาบ\nกุแหละ\nกู\nกู่\nกู้\nกู๊ก\nกูฏ\nกูฏา\nกูณฑ์\nกูด\nกูบ\nกูปรี\nกูรมะ\nกูรมาวตาร\nเก\nเก้\nเก๊\nเก๋\nเกก\nเก๊ก\nเกกมะเหรก\nเก๊กฮวย\nเก้กัง\nเก็ง\nเก่ง\nเก้ง\nเก๋ง\nเกงกอย\nเก่งกาจ\nเกงเขง\nเก๋งเคง\nเก็จ\nเกจิอาจารย์\nเกชา\nเกณฑ์\nเกด\nเก็ด\nเกตุ\nเกน\nเก็บ\nเกม\nเกย\nเกยูร\nเกรง\nเกร็ง\nเกร็ด\nเกรน\nเกร่อ\nเกรอะ\nเกราะ\nเกริก\nเกริน\nเกริ่น\nเกรียก\nเกรียง\nเกรียด\nเกรียน\nเกรียบ\nเกรียม\nเกรียว\nเกรี้ยว\nเกเร\nเกล็ด\nเกลศ\nเกลอ\nเกลา\nเกล้า\nเกลาะ\nเกลี่ย\nเกลี้ย\nเกลียง\nเกลี้ยง\nเกลียด\nเกลียว\nเกลือ\nเกลื้อ\nเกลือก\nเกลื่อน\nเกลื้อน\nเกวัฏ\nเกวียน\nเกศ\nเกศว\nเกศวะ\nเกศา\nเกศินี\nเกศี\nเกษตร\nเกษม\nเกษียณ\nเกษียน\nเกษียร\nเกส\nเกสร\nเกสรี\nเกสา\nเกสี\nเก้อ\nเกอิชา\nเกะ\nเกะกะ\nเกา\nเก่า\nเก้า\nเก๋า\nเกาต์\nเกาทัณฑ์\nเกาบิล\nเกาลัด\nเกาลิน\nเกาไศย\nเกาหลี\nเกาเหลา\nเกาเหลียง\nเก้าอี้\nเกาะ\nเกิ้ง\nเกิด\nเกิน\nเกิบ\nเกีย\nเกียกกาย\nเกียง\nเกี่ยง\nเกี๋ยง\nเกียจ\nเกียด\nเกียน\nเกี้ยมไฉ่\nเกี้ยมอี๋\nเกียร์\nเกียรติ\nเกียรติ์\nเกี่ยว\nเกี้ยว\nเกี๊ยว\nเกี๊ยะ\nเกื้อ\nเกือก\nเกื้อกูล\nเกือบ\nแก\nแก่\nแก้\nแกง\nแก่ง\nแก้ง\nแก๊ง\nแกงได\nแกงแนง\nแกโดลิเนียม\nแกน\nแก่น\nแก๊ป\nแกม\nแก้ม\nแกมมา\nแกรก\nแกร่ง\nแกร็น\nแกรนิต\nแกรไฟต์\nแกร่ว\nแกระ\nแกล\nแกล้ง\nแกลน\nแกลบ\nแกล้ม\nแกลลอน\nแกลเลียม\nแกล้ว\nแกละ\nแกแล\nแกว\nแก้ว\nแกว่ง\nแก๊ส\nแกะ\nโก\nโก่\nโก้\nโก๋\nโกก\nโกกนุท\nโกกิลา\nโกโก้\nโกง\nโก่ง\nโกงกาง\nโก้งเก้ง\nโกงโก้\nโก้งโค้ง\nโกเชาว์\nโกญจนาท\nโกญจา\nโกฏิ\nโกฐ\nโกฐาส\nโกณะ\nโกดัง\nโกทัณฑ์\nโกน\nโก่น\nโก๋น\nโกมล\nโกมุท\nโกเมน\nโกเมศ\nโกย\nโกรก\nโกรกกราก\nโกรง\nโกร่ง\nโกร่งกร่าง\nโกรงเกรง\nโกร๋งเกร๋ง\nโกรญจ\nโกรต๋น\nโกรธ\nโกรธา\nโกร๋น\nโกรม\nโกรย\nโกรศ\nโกโรโกเต\nโกโรโกโรก\nโกโรโกโส\nโกลน\nโกลาหล\nโกไล\nโกวิท\nโกศ\nโกศล\nโกษม\nโกสน\nโกสัช\nโกสินทร์\nโกสีย์\nโกสุม\nโกไสย\nโกหก\nใกล้\nไก\nไก่\nไก๊\nไก๋\nไก่กอม\nไกพัล\nไกร\nไกรพ\nไกรลาส\nไกรศร\nไกรศรี\nไกรสร\nไกรสรี\nไกรสิทธิ\nไกล\nไกล่\nไกลาส
\nไกว\nไกวัล\nขงจื๊อ\nขจร\nขจรจบ\nขจัด\nขจ่าง\nขจาย\nขจาว\nขจิต\nขจี\nขจุย\nขเจา\nขณะ\nขด\nขน\nข้น\nขนง\nขนด\nขนบ\nขนม\nขนอง\nขนอน\nขนอบ\nขนัด\nขนัน\nขนาก\nขนาง\nขนาด\nขนาน\nขนาบ\nขนาย\nขนำ\nขนิษฐ\nขนิษฐา\nขนุน\nขนุนนก\nขบ\nขบถ\nขบวน\nขบวร\nขม\nข่ม\nขมงโกรย\nขมวด\nขมวน\nขมอง\nขม่อม\nขมัง\nขมับ\nขมา\nขม้ำ\nขมิ้น\nขมิบ\nขมีขมัน\nขมึง\nขมึงทึง\nขมุ\nขมุกขมัว\nขมุบ\nขโมย\nขยด\nขยม\nขย่ม\nขยอก\nขยอง\nขย่อน\nขย้อน\nขยะ\nขยัก\nขยัน\nขยั้น\nขยับ\nขยาด\nขยาย\nขยำ\nขย้ำ\nขยิก\nขยิบ\nขยิ่ม\nขยี้\nขยุกขยิก\nขยุกขยุย\nขยุบ\nขยุบขยิบ\nขยุม\nขยุ้ม\nขยุย\nขรม\nขรรค์\nขรัว\nขริบ\nขรี\nขรึม\nขรุขระ\nขลบ\nขล้อ\nขลัง\nขลับ\nขลาด\nขลาย\nขลิบ\nขลุก\nขลุกขลัก\nขลุกขลิก\nขลุบ\nขลุม\nขลุ่ย\nขลู\nขลู่\nขวง\nข่วง\nขวด\nข่วน\nขวนขวาย\nขวบ\nขวย\nขวักไขว่\nขวัญ\nขวั้น\nขวับ\nขวับเขวียว\nขวา\nขวาก\nขวาง\nขว้าง\nขวาด\nขวาน\nขวายขวน\nขวาว\nขว้าว\nขวิด\nขอ\nข่อ\nข้อ\nของ\nข้อง\nขอด\nขอน\nข้อน\nขอบ\nขอม\nข่อย\nข้อย\nข่อยหยอง\nขะแจะ\nขะเน็ด\nขะมอมขะแมม\nขะมักเขม้น\nขะมุกขะมอม\nขะยิก\nขะยุก\nขะเย้อแขย่ง\nขัค\nขัง\nขังขอก\nขัช\nขัณฑสกร\nขัณฑสีมา\nขัด\nขัดมอน\nขัตติยมานะ\nขัน\nขั้น\nขันติ\nขันตี\nขันโตก\nขันที\nขันธ์\nขันธาวาร\nขับ\nขัว\nขั้ว\nขา\nข่า\nข้า\nขาก\nขาก๊วย\nขาง\nข่าง\nข้าง\nขาณุ\nขาด\nขาทนียะ\nขาน\nขาบ\nข้าพเจ้า\nขาม\nข่าม\nข้าม\nขาย\nข่าย\nขาล\nขาว\nข่าว\nข้าว\nข้าวอังกุลี\nขำ\nขิก\nขิง\nขิงแกลง\nขิงแครง\nขิด\nขิปสัทโท\nขิม\nขี่\nขี้\nขี้เข็บ\nขีณาสพ\nขีด\nขี้ตังนี\nขีปนาวุธ\nขี้ยอก\nขีระ\nขึง\nขึ้ง\nขึ้น\nขึ้นฉ่าย\nขืน\nขื่น\nขื่อ\nขุก\nขุด\nขุน\nขุ่น\nขุนเพ็ด\nขุม\nขุย\nขู่\nขูด\nเข\nเข้\nเขก\nเข็ง\nเข่ง\nเขจร\nเข็ญ\nเข็ด\nเขดา\nเขต\nเขน\nเข็น\nเข่น\nเขนง\nเขน็ด\nเขนย\nเขบ็จขบวน\nเขบ็ต\nเขม\nเข็ม\nเข้ม\nเข้มขาบ\nเขม็ง\nเขม็ดแขม่\nเขม่น\nเขม้น\nเขม้นขะมัก\nเขมร\nเขมา\nเขม่า\nเขมือบ\nเขย\nเขยก\nเขย่ง\nเขย้อแขย่ง\nเขย่า\nเขยิน\nเขยิบ\nเขยื้อน\nเขรอะ\nเขลง\nเขลอะ\nเขละ\nเขลา\nเขลาะ\nเขว\nเขษม\nเขฬะ\nเขะขะ\nเขา\nเข่า\nเข้า\nเขิง\nเขิน\nเขิบ\nเขี่ย\nเขียง\nเขียด\nเขียดตะปาด\nเขียน\nเขี่ยน\nเขียม\nเขียว\nเขี้ยว\nเขียะ\nเขือ\nเขือง\nเขื่อง\nเขื่อน\nเขือม\nแข\nแข้\nแขก\nแข็ง\nแข่ง\nแข้ง\nแขน\nแข่น
\nแข้น\nแขนง\nแขม\nแขม็บ\nแขม่ว\nแขยง\nแขย่ง\nแขวก\nแขวง\nแขวน\nแขวะ\nโข\nโขก\nโขง\nโข่ง\nโขด\nโขดง\nโขน\nโขนง\nโขม\nโขมง\nโขมด\nโขยก\nโขยง\nโขย่ง\nโขยด\nโขลก\nโขลง\nโขลน\nโขษม\nไข\nไข่\nไข้\nไขว่\nไขว้\nคคนะ\nคคนัมพร\nคคนางค์\nคคนานต์\nคง\nคงคา\nคงไคย\nคช\nคชาชาติ\nคชาชีพ\nคชาธาร\nคชาภรณ์\nคณนา\nคณบดี\nคณะ\nคณาจารย์\nคณาธิการ\nคณาธิปไตย\nคณานับ\nคณิกา\nคณิต\nคเณศ\nคด\nคดี\nคติ\nคทา\nคน\nค้น\nคนทา\nคนทิสอ\nคนที\nคนโท\nคนธ์\nคันธ์\nคนธรรพ์\nคเนจร\nคบ\nคม\nคมน์\nคมนาการ\nคมนาคม\nคมิกภัต\nครก\nครบ\nครรชิต\nครรภ\nครรภ์\nครรลอง\nครรโลง\nครรไล\nครวญ\nครวี\nครหา\nครอก\nครอง\nครองแครง\nคร่อเงาะ\nคร่อเทียน\nครอบ\nคร่อม\nคระเมิม\nคระแลง\nคระไล\nคระแวง\nคระหน\nคระหวน\nคระหาย\nคระโหย\nครั่ง\nครั้ง\nครัดเคร่ง\nครัน\nครั่น\nครั้น\nครับ\nครัว\nครา\nคร่า\nคราก\nคราง\nคราญ\nคราด\nคร้าน\nคราบ\nคราม\nคร้าม\nครามครัน\nคราว\nคร่าว\nคราส\nครำ\nคร่ำ\nคร่ำเคร่ง\nคริปทอน\nคริสต์\nครีบ\nครีม\nครีษมายัน\nครึ\nครึกครื้น\nครึกโครม\nครึ่ง\nครึ่ด\nครึน\nครึ้ม\nครืด\nครืน\nครื้น\nครืนครั่น\nครื้นครั่น\nครื้นครึก\nครื้นเครง\nครือ\nครุ\nครุคระ\nครุฑ\nครุ่น\nครุมเครือ\nครุย\nครุวนา\nครู\nครู่\nครูด\nคฤโฆษ\nคฤนถ์\nคฤหบดี\nคฤหัสถ์\nคฤหาสน์\nคลวง\nคลอ\nคล้อ\nคลอก\nคลอง\nคล่อง\nคล้อง\nคลอด\nคลอน\nคล้อย\nคลอรีน\nคลอโรฟอร์ม\nคลอโรฟีลล์\nคละ\nคละคลุ้ง\nคลัก\nคลั่ก\nคลัง\nคลั่ง\nคลัตช์\nคลับคล้าย\nคลับคลา\nคลา\nคล้า\nคลางแคลง\nคลาด\nคลาน\nคลาย\nคล้าย\nคล้ายคลึง\nคล่าว\nคลำ\nคล่ำ\nคล้ำ\nคลิ้งโคลง\nคลิด\nคลินิก\nคลี\nคลี่\nคลึง\nคลื่น\nคลุก\nคลุ้ง\nคลุบ\nคลุม\nคลุ่ม\nคลุ้ม\nควง\nควณ\nควน\nควบ\nควย\nควร\nควัก\nควั่ก\nควั่งคว้าง\nควัน\nควั่น\nคว้า\nควาก\nคว้าง\nควาญ\nควาน\nคว้าน\nความ\nควาย\nคว่าว\nคว่ำ\nควินิน\nควิวคว่าง\nคหกรรม\nคหกรรมศาสตร์\nคหบดี\nคหัฐ\nคอ\nค้อ\nคอก\nค็อกคัส\nค็อกเทล\nคอเคซอยด์\nค่องอ้อย\nคอด\nคอแดง\nคอน\nค่อน\nค้อน\nคอนกรีต\nคอนเดนเซอร์\nคอนแวนต์\nคอนเสิร์ต\nคอม\nค่อม\nค้อม\nคอมพิวเตอร์\nคอมมานโด\nคอมมิวนิสต์\nคอย\nค่อย\nค้อย\nคอยล์\nคอร์ด\nคอแลน\nคอสติกโซดา\nคะ\nค่ะ\nคะไขว่\nคะค้อย\nคะคาน\nคะนน\nคะนอง\nคะน้า\nคะนึง\nคะเน\nคะเนงร้าย\nคะเน็ด\nคะแนน\nคะมำ\nคะยั้นคะยอ\nคะเยอ\nคัก\nคั่ก\nคัคนะ\
nคัคนัมพร\nคัคนางค์\nคัคนานต์\nคั่ง\nคังไคย\nคัจฉ\nคัณฑมาลา\nคัณฑสูตร\nคัด\nคัดเค้า\nคัดมอน\nคัดเม็ง\nคัทลียา\nคัน\nคั่น\nคั้น\nคันถรจนาจารย์\nคันธกุฎี\nคันธมาทน์\nคันธารราษฎร์\nคับ\nคับค้อน\nคับคา\nคับแค\nคัพภ์\nคัมภีร์\nคัมภีรภาพ\nคัล\nคั่ว\nคา\nค่า\nค้า\nค่าคบ\nคาง\nค่าง\nค้าง\nคางคก\nค้างคาว\nคาด\nคาถา\nคาทอลิก\nคาน\nค้าน\nคาบ\nคาพยุต\nคาม\nคามวาสี\nคามณีย์\nคามภีร์\nคาย\nค่าย\nคาร์บอน\nคาร์บอเนต\nคาร์บอลิก\nคาร์บูเรเตอร์\nคาร์โบรันดัม\nคาร์โบไฮเดรต\nคารม\nคารวะ\nคาราเต้\nคาราวาน\nคาว\nค่าว\nค้าว\nคาวตอง\nคาวี\nคาวุต\nคาส\nคำ\nค่ำ\nค้ำ\nคำนวณ\nคำนวร\nคำนับ\nคำนัล\nคำนึง\nคำนูณ\nคำฝอย\nคำเพลิง\nคำรน\nคำรบ\nคำราม\nคำแสด\nคำแหง\nคำโอง\nคิก\nคิง\nคิด\nคิมหันต์\nคิรี\nคิลาน\nคิลานะ\nคิว\nคิ้ว\nคี่\nคีต\nคีบ\nคีม\nคีรี\nคีรีบูน\nคึก\nคึ่ก\nคึกคัก\nคืน\nคืบ\nคือ\nคุ\nคุก\nคุกกี้\nคุกคาม\nคุกพาทย์\nคุ้ง\nคุณ\nคุด\nคุดทะราด\nคุต\nคุตติ\nคุ่น\nคุ้น\nคุป\nคุปต์\nคุปติ\nคุม\nคุ่ม\nคุ้ม\nคุย\nคุ้ย\nคุยหฐาน\nคุยหประเทศ\nคุรุ\nคุลา\nคุลิก่า\nคุลีการ\nคุหา\nคู\nคู่\nคู้\nคูณ\nคูถ\nคูน\nคูปอง\nคูเรียม\nคูหา\nเค้ก\nเค้เก้\nเค้ง\nเคณฑะ\nเคด\nเค็ด\nเคน\nเค้น\nเคเบิล\nเค็ม\nเคมี\nเคย\nเครง\nเคร่ง\nเครงครา\nเครงครำ\nเครดิต\nเครน\nเครา\nเคร่า\nเคราหณี\nเคราะห์\nเครียด\nเครียว\nเครือ\nเครื่อง\nเคล้ง\nเคล็ด\nเคล้น\nเคล้า\nเคล่าคล่อง\nเคลิบเคลิ้ม\nเคลิ้ม\nเคลีย\nเคลื่อน\nเคลือบ\nเคว้ง\nเคหะ\nเคหา\nเคอะ\nเค้า\nเคาน์เตอร์\nเคารพ\nเคาะ\nเคาะแคะ\nเคียง\nเคียด\nเคียน\nเคียม\nเคี่ยม\nเคียร\nเคียว\nเคี่ยว\nเคี้ยว\nเคือง\nแค\nแค่\nแค้\nแคแล\nแคดเมียม\nแค็ตตาล็อก\nแคแตร\nแคโทด\nแคน\nแค่น\nแค้น\nแคบ\nแคบหมู\nแคปซูล\nแคม\nแคร่\nแครก\nแครง\nแคระ\nแคลคูลัส\nแคลง\nแคลเซียม\nแคลน\nแคล้ว\nแคล่วคล่อง\nแคลอรี\nแคลิฟอร์เนียม\nแคว\nแควก\nแคว้น\nแคแสด\nแคะ\nโค\nโคก\nโคเคน\nโค่ง\nโค้ง\nโคจร\nโคเซแคนต์\nโคไซน์\nโคตร\nโคแทนเจนต์\nโคธา\nโคน\nโค่น\nโคบอลต์\nโคปผกะ\nโคม\nโคม่า\nโครก\nโครกคราก\nโครง\nโคร่ง\nโคร่งคร่าง\nโครม\nโครเมียม\nโครโมโซม\nโคราช\nโครำ\nโคล\nโคลง\nโคลน\nโควตา\nโคออร์ดิเนต\nใคร\nใคร่\nใคร่ครวญ\nไค\nไค้\nไคร้\nไคร้เครือ\nไคล\nไคล้\nฆนะ\nฆราวาส\nฆ้อง\nฆ่า\nฆาต\nฆาน\nฆานินทรีย์\nเฆี่ยน\nโฆรวิส\nโฆษ
ก\nโฆษณา\nโฆษะ\nโฆษิต\nงก\nงง\nงด\nงดงาม\nงบ\nงม\nงวง\nง่วง\nงวด\nง่วน\nง้วน\nงวยงง\nงอ\nง้อ\nงอก\nงอกแงก\nง่อกแง่ก\nง่อง\nง่องแง่ง\nงอแง\nงอด\nงอดแงด\nงอน\nง่อน\nง่อนแง่น\nงอนหง่อ\nงอบ\nงอม\nง้อม\nงอย\nง่อย\nงัก\nงั่ก\nงั่ง\nงัด\nงัน\nงันงก\nงับ\nงัว\nงั่ว\nงัวเงีย\nงา\nง่า\nง้าง\nงาน\nง่าน\nงาบ\nงาม\nง่าม\nงาย\nง่าย\nง้าว\nงำ\nง่ำ\nง้ำ\nงิ้ว\nงี่เง่า\nงีบ\nงึก\nงึน\nงึม\nงุด\nงุนงง\nงุ่นง่าน\nงุบ\nงุบงิบ\nงุ้ม\nงุ่มง่าม\nงุย\nงู\nงูบ\nงูสวัด\nเงก\nเงย\nเงอะ\nเงอะงะ\nเงา\nเง่า\nเง้า\nเงาะ\nเงิน\nเงี่ยง\nเงี่ยน\nเงียบ\nเงี้ยว\nเงี่ยหู\nเงื้อ\nเงือก\nเงื่อง\nเงือด\nเงื่อน\nเงือบ\nเงื้อม\nแง\nแง่\nแง่ง\nแง่น\nแง้ม\nแงะ\nโง\nโง่\nโงก\nโงกเงก\nโงง\nโง่ง\nโง้ง\nโงงเงง\nโง่งเง่ง\nโงเง\nโงน\nโงนเงน\nไง้\nจก\nจง\nจ่ง\nจงกรม\nจงกล\nจงกลนี\nจงโคร่ง\nโจงโคร่ง\nจงอร\nจงอาง\nจด\nจดุรงค์\nจตุปัจจัย\nจตุลังคบาท\nจตุโลกบาล\nจตุสดมภ์\nจตุตถ\nจตุตถี\nจตุร\nจตุรงค์\nจตุรพักตร์\nจตุรพิธ\nจตุรพิธพร\nจน\nจบ\nจม\nจ่ม\nจมร\nจมรี\nจมูก\nจยุติ\nจร\nจรณะ\nจรด\nจรรจา\nจรรโจษ\nจรรม\nจรรยา\nจรรโลง\nจรลี\nจรวจ\nจรวด\nจรส\nจรอก\nจระเข้\nจระนำ\nจระบี\nจรัล\nจรัส\nจราจร\nจราญ\nจริก\nจริง\nจริต\nจริม\nจริยธรรม\nจริยวัตร\nจริยาวัตร\nจริยศาสตร์\nจริยศึกษา\nจริยา\nจรุง\nจรูง\nจรูญ\nจเร\nจล\nจลนพลศาสตร์\nจลนศาสตร์\nจลนี\nจลาจล\nจวก\nจ๊วก\nจวง\nจ้วง\nจ๋วง\nจวด\nจวน\nจวบ\nจวัก\nจอ\nจ่อ\nจ้อ\nจ๋อ\nจอก\nจ้อก\nจ๊อก\nจ้อกแจ้ก\nจอง\nจ้อง\nจ๋อง\nจ้องเต\nจองเปรียง\nจ้องหน่อง\nจองหอง\nจอแจ\nจ๋อแจ๋\nจอด\nจอน\nจ้อน\nจอนจ่อ\nจอบ\nจอม\nจ่อม\nจอมสุรางค์\nจ่อย\nจ้อย\nจ๋อย\nจอแส\nจะ\nจ้ะ\nจ๊ะ\nจ๋ะ\nจะกละ\nจะกลาม\nจะกูด\nจะขาบ\nจะเข้\nจะเข็บ\nจะงอย\nจะจะ\nจ๊ะจ๋า\nจะแจ้ง\nจะแจ่ม\nจะละเม็ด\nจะละหวั่น\nจัก\nจั้ก\nจักกาย\nจั๊กกิ้ม\nจักขุ\nจักจั่น\nจักจี้\nจั๊กจี้\nจั๊กเดียม\nจักร\nจักรพาก\nจักรวาก\nจักริน\nจักรี\nจั๊กเล้อ\nจักษุ\nจักแหล่น\nจัง\nจั้ง\nจั๋ง\nจังกวด\nจังกอบ\nจังก้า\nจังกูด\nจังโกฏก์\nจังงัง\nจั้งมั่ง\nจังไร\nจังหนับ\nจังหรีด\nจังหวะ\nจังหวัด\nจังหัน\nจัญไร\nจัณฑ์\nจัณฑาล\nจัด\nจัตตาฬีสะ\nจัตวา\nจัตุ\nจัตุรงค์\nจัตุรัส\nจัตุลังคบาท\nจัตุโลกบาล\nจัตุสดมภ์\nจัน\nจั่น\nจันโจษ\nจั่นดิน\nจันท์\nจันทน์\nจันทร์\nจันทรคต
ิ\nจันทรคราส\nจันทรุปราคา\nจันทรเม็ด\nจันทวาร\nจันทัน\nจันอับ\nจับ\nจับกัง\nจับฉ่าย\nจับเดิม\nจับปิ้ง\nจับยี่กี\nจัมบก\nจัมปกะ\nจัมปา\nจัมมะ\nจัว\nจั่ว\nจั๊วะ\nจา\nจ่า\nจ้า\nจ๋า\nจาก\nจากพาก\nจาคะ\nจาคี\nจาง\nจ่าง\nจ้าง\nจางปาง\nจางวาง\nจาด\nจาตุรงค์\nจาตุรนต์\nจาตุรันต์\nจาน\nจ้าน\nจาบ\nจาบัล\nจาบัลย์\nจาป\nจาม\nจ่ามงกุฎ\nจามจุรี\nจามร\nจามรี\nจามีกร\nจ่าย\nจาร\nจ่ารง\nจารวาก\nจาระไน\nจาระบี\nจาริก\nจารึก\nจารี\nจารีต\nจารุ\nจ้าละหวั่น\nจาว\nจ้าว\nจ่าหวัก\nจำ\nจ้ำ\nจำกัด\nจำงาย\nจ้ำจี้\nจำเจ\nจำเดิม\nจำทวย\nจำนง\nจำนน\nจำนรรจ์\nจำนรรจา\nจำนวน\nจำนอง\nจำนัล\nจำนำ\nจำเนียน\nจำเนียม\nจำเนียร\nจำแนก\nจำบ่ม\nจำบัง\nจ้ำเบ้า\nจำปา\nจำปาดะ\nจำปี\nจำปูน\nจำพวก\nจำเพาะ\nจ้ำม่ำ\nจำรัส\nจำราญ\nจำรูญ\nจำเริญ\nจำเรียง\nจำลอง\nจำเลย\nจำเลาะ\nจำแลง\nจำแล่น\nจำหนับ\nจ๋ำหนับ\nจำหน่าย\nจำหระ\nจำหล่อ\nจำหลัก\nจำเหียง\nจำอวด\nจิ\nจิก\nจิ้งโกร่ง\nจิ้งจก\nจิงจ้อ\nจิ้งจอก\nจิงจัง\nจิ้งจัง\nจิงโจ้\nจิ้งหรีด\nจิ้งเหลน\nจิต\nจิตกาธาน\nจิตต์\nจิตร\nจิตรจุล\nจิตระ\nจิตรา\nจินเจา\nจินดา\nจินดาหนา\nจินดาหรา\nจินต์\nจิบ\nจิปาถะ\nจิ่ม\nจิ้ม\nจิ้มก้อง\nจิ้มลิ้ม\nจิรกาล\nจิ๋ว\nจี\nจี่\nจี้\nจี๋\nจี๋จ้อ\nจีแจ๊บ\nจี๊ด\nจีน\nจีนแส\nจีบ\nจีโบ\nจีม\nจีวร\nจึง\nจึ่ง\nจึ้ง\nจืด\nจุ\nจุก\nจุ๊กกรู๊\nจุกจิก\nจุกชี\nจุกผาม\nจุกโรหินี\nจุ่ง\nจุ๋งจิ๋ง\nจุฑา\nจุณ\nจุณณียบท\nจุด\nจุติ\nจุตูปปาตญาณ\nจุทส\nจุน\nจุ่น\nจุ้น\nจุนจู๋\nจุ้นจู๊\nจุนทการ\nจุนสี\nจุบ\nจุ๊บ\nจุบจิบ\nจุ๊บแจง\nจุ่ม\nจุ้ม\nจุ๋มจิ๋ม\nจุมพฏ\nจุมพรวด\nจุมพล\nจุมพิต\nจุมโพล่\nจุ้ย\nจุรณ\nจูรณ\nจุรี\nจุไร\nจุล\nจุลจอมเกล้า\nจุลวงศ์\nจุฬา\nจุฬาราชมนตรี\nจุฬาลัมพา\nจุฬาลำพา\nจู\nจู่\nจู้\nจู๋\nจูง\nจู้จี้\nจู๋จี๋\nจู๊ด\nจูบ\nเจ\nเจ๊ก\nเจ่ง\nเจ้ง\nเจ๊ง\nเจ๋ง\nเจ็ด\nเจ็ดตะคลี\nเจดีย์\nเจดียสถาน\nเจต\nเจตนา\nเจตพังคี\nเจตมูลเพลิง\nเจตสิก\nเจโตวิมุติ\nเจน\nเจ็บ\nเจรจา\nเจริญ\nเจริด\nเจรียง\nเจลียง\nเจว็ด\nเจษฎา\nเจ๊สัว\nเจอ\nเจ่อ\nเจ๋อ\nเจ๋อเจ๊อะ\nเจอร์เมเนียม\nเจอะ\nเจา\nเจ่า\nเจ้า\nเจ๊า\nเจาะ\nเจิ่ง\nเจิด\nเจิ่น\nเจิม\nเจีย\nเจียง\nเจียด\nเจียน\nเจี๋ยน\nเจี๊ยบ\nเจียม\nเจี๋ยมเจี้ยม\nเจียร\nเจียระไน\nเจียระบาด\nเจียว\nเจี๊ยวจ๊าว\nเจือ\nเจื่อน\nเจื้อย\nเจื
อสม\nแจ\nแจ้\nแจ๋\nแจก\nแจกัน\nแจง\nแจ่ง\nแจ้ง\nแจงลอน\nแจ๊ด\nแจ๊ดแจ๋\nแจตร\nแจ้น\nแจบ\nแจ่ม\nแจรง\nแจว\nแจ่ว\nแจ้ว\nแจ๋ว\nแจะ\nโจก\nโจ๊ก\nโจง\nโจ่งครึ่ม\nโจ๋งครึ่ม\nโจ่งครุ่ม\nโจ๋งเจ๋ง\nโจ่งแจ้ง\nโจท\nโจทก์\nโจทนา\nโจทย์\nโจน\nโจม\nโจร\nโจล\nโจษ\nโจษจัน\nใจ\nไจ\nไจ้\nฉก\nฉกรรจ์\nฉกษัตริย์\nฉกาจ\nฉกามาพจร\nฉกามาวจร\nฉง\nฉงน\nฉงาย\nฉทึง\nฉนวน\nฉนัง\nฉนาก\nฉนำ\nฉบัง\nฉบัด\nฉบับ\nฉบำ\nฉม\nฉมบ\nฉมวก\nฉมวย\nฉม่อง\nฉมัง\nฉมัน\nฉมา\nฉมำ\nฉล\nฉลวย\nฉลอง\nฉลอม\nฉลัก\nฉลับ\nฉลาก\nฉลาง\nฉลาด\nฉลาม\nฉลาย\nฉลิว\nฉลีก\nฉลุ\nฉลู\nฉวย\nฉวะ\nฉวัดเฉวียน\nฉวาง\nฉวี\nฉศก\nฉ้อ\nฉอก\nฉ่อง\nฉอด\nฉ่อย\nฉอเลาะ\nฉะ\nฉะฉาด\nฉะฉาน\nฉะฉ่ำ\nฉะฉี่\nฉะเฉื่อย\nฉะนั้น\nฉะนี้\nฉะอ้อน\nฉักกะ\nฉัฐ\nฉัด\nฉัตร\nฉัททันต์\nฉัน\nฉันท\nฉันท์\nฉันทะ\nฉันทา\nฉันทาคติ\nฉันทานุมัติ\nฉับ\nฉัพพรรณรังสี\nฉัยยา\nฉ่า\nฉาก\nฉาง\nฉ่าง\nฉ่าฉาว\nฉาด\nฉาดฉาน\nฉาตกภัย\nฉาน\nฉาบ\nฉาบฉวย\nฉาย\nฉายา\nฉาว\nฉ่ำ\nฉำฉา\nฉำแฉะ\nฉิ่ง\nฉิน\nฉินท์\nฉินทฤกษ์\nฉิบ\nฉิมพลี\nฉิว\nฉี่\nฉีก\nฉีด\nฉุ\nฉุก\nฉุด\nฉุน\nฉุป\nฉุป\nฉุย\nฉุยฉาย\nฉู่\nฉู่ฉี่\nฉูด\nฉูดฉาด\nเฉ\nเฉก\nเฉโก\nเฉ่ง\nเฉด\nเฉท\nเฉนียน\nเฉพาะ\nเฉย\nเฉลย\nเฉลว\nเฉลา\nเฉลิม\nเฉลี่ย\nเฉลียง\nเฉลี่ยง\nเฉลียบ\nเฉลียว\nเฉวียง\nเฉวียน\nเฉอะแฉะ\nเฉา\nเฉาก๊วย\nเฉาฮื้อ\nเฉาะ\nเฉิด\nเฉิบ\nเฉียง\nเฉียงพร้านางแอ\nเฉียด\nเฉียบ\nเฉียว\nเฉี่ยว\nเฉือน\nเฉื่อย\nแฉ\nแฉ่\nแฉก\nแฉง\nแฉ่ง\nแฉลบ\nแฉล้ม\nแฉะ\nโฉ\nโฉ่\nโฉเก\nโฉ่งฉ่าง\nโฉงเฉง\nโฉด\nโฉนด\nโฉบ\nโฉเบ๊\nโฉม\nโฉลก\nไฉน\nไฉไล\nชก\nชคัตตรัย\nชง\nชงโค\nชงฆ์\nชงฆา\nชงโลง\nชฎา\nชฎามังษี\nชฎามังสี\nชฎิล\nชด\nชน\nชนก\nชนนี\nชนม์\nชนวน\nชนะ\nชนัก\nชนา\nชนาง\nชนิด\nชเนตตี\nชบา\nชม\nชมดชม้อย\nชมนาด\nชมพู\nชมพู่\nชมรม\nชม้อย\nชม้าย\nชไม\nชยา\nชโย\nชรทึง\nชรริน\nชรอุ่ม\nชระล้ำ\nชระลุ\nชระอาบ\nชระเอม\nชรัด\nชรา\nชล\nชโลง\nชโลม\nช่วง\nชวด\nชวน\nชวย\nช่วย\nชวร\nชวลิต\nชวา\nชวาล\nชวาลา\nช่อ\nชอก\nช็อก\nช็อกโกเลต\nช็อกโกแลต\nชอง\nช่อง\nช้อง\nชองระอา\nชอน\nช่อน\nช้อน\nชอบ\nชอม\nช้อย\nชอล์ก\nชอ่ำ\nชอุ่ม\nชะ\nชะคราม\nชะงอก\nชะง่อน\nชะงัก\nชะงัด\nชะง้ำ\nชะงุ้ม\nชะเง้อ\nชะเงื้อม\nชะแง้\nชะโงก\nชะฉ่า\nชะช่อง\nชะชะ\nชะช้า\nชะโด\nชะตา\nชะต้า\nชะนี\nชะเนาะ\nชะเนีย
ง\nชะพลู\nชะเพลิง\nชะมด\nชะมบ\nชะมวง\nชะมัง\nชะมัด\nชะแม่\nชะรอย\nชะลอ\nชะลอม\nชะล่า\nชะลาน\nชะลิน\nชะลูด\nชะเลง\nชะเลย\nชะแล็ก\nชะแลง\nชะวาก\nชะวาด\nชะเวิกชะวาก\nชะแวง\nชะอม\nชะอ้อน\nชะเอม\nชะโอน\nชัก\nชักคราม\nชักช้า\nชัง\nชั่ง\nชังคา\nชังฆ\nชัชวาล\nชัฏ\nชัด\nชัดช้า\nชัน\nชั้น\nชันกาด\nชันชี\nชันตุ\nชันนะตุ\nชันนุ\nชันโรง\nชันษา\nชันสูตร\nชัปนะ\nชัพ\nชัมพูนท\nชัย\nชัยพฤกษ์\nชัยภูมิ\nชัลลุกา\nชั่ว\nชั้ว\nชัวชม\nชา\nช้า\nชาคระ\nชาคริต\nชาคริยานุโยค\nช่าง\nช้าง\nช้าช่อน\nชาญ\nชาด\nชาดก\nชาต\nชาตบุษย์\nชาตรี\nชาตะ\nชาตา\nชาติ\nชาน\nชานุ\nช้าปี่\nชาปีไหน\nช้าแป้น\nช้าพลู\nชาม\nชามพูนท\nชามาดร\nชามาดา\nชามาตุ\nชาย\nชายา\nชาล\nชาลา\nชาลินี\nช้าเลือด\nชาว\nชาวี\nชำ\nช่ำ\nช้ำ\nชำงัด\nชำงาย\nช่ำชอง\nชำนะ\nชำนัญ\nชำนัน\nชำนาญ\nชำนิ\nชำเนียร\nชำมะนาด\nชำมะเลียง\nชำร่วย\nชำระ\nช้ำรั่ว\nชำรุด\nชำเรา\nชำเราะ\nชำแรก\nชำแระ\nชำเลือง\nชำแหละ\nชิ\nชิง\nชิ่ง\nชิงชัน\nชิงช้า\nชิงช้าชาลี\nชิงชี่\nชิงฮื้อ\nชิชะ\nชิชิ\nชิณณะ\nชิด\nชิเดนทรีย์\nชิต\nชิตินทรีย์\nชิน\nชิ้น\nชินโต\nชิโนรส\nชิม\nชิมแปนซี\nชิยา\nชิรณะ\nชิระ\nชิวหา\nชิสา\nชี\nชี่\nชี้\nชีปะขาว\nชีผะขาว\nชีผ้าขาว\nชีพ\nชีฟอง\nชีรณ\nชีระ\nชีวเคมี\nชีวประวัติ\nชีวภาพ\nชีววิทยา\nชีวะ\nชีวัน\nชีวา\nชีวาตม์\nชีวาลัย\nชีวิต\nชีวิตักษัย\nชีวิน\nชีวี\nชืด\nชื่น\nชื้น\nชื่อ\nชุก\nชุกชี\nชุ้ง\nชุณห\nชุด\nชุติ\nชุน\nชุบ\nชุม\nชุ่ม\nชุมนุม\nชุมพร\nชุมพา\nชุมแพรก\nชุมรุม\nชุมแสง\nชุมเห็ด\nชุ่ย\nชุลมุน\nชุลี\nชุษณะ\nชู\nชู้\nเช็ค\nเช้ง\nเช้งวับ\nเชงเลง\nเช็ด\nเชน\nเช่น\nเชย\nเชลง\nเชลย\nเชลแล็ก\nเชลียง\nเชวง\nเชษฐะ\nเชษฐา\nเชอ\nเช่า\nเช้า\nเชาว์\nเชาวน์\nเชิง\nเชิญ\nเชิด\nเชิ้ต\nเชียง\nเชี่ยน\nเชียบ\nเชี่ยม\nเชียร\nเชียร์\nเชียว\nเชี่ยว\nเชื่อ\nเชื้อ\nเชือก\nเชื่อง\nเชือด\nเชือน\nเชื่อม\nแช\nแช่\nแช่ง\nแชงมา\nแชบ๊วย\nแช่ม\nแชร์\nแชล่ม\nแชสซี\nแชะ\nโชก\nโชค\nโชงโลง\nโชดก\nโชดึก\nโชต\nโชตก\nโชติ\nโชติก\nโชน\nโชมโรม\nโชย\nโชยงการ\nโชยชาย\nโชยติส\nโชว์\nใช่\nใช้\nไช\nไชนะ\nไชย\nไชโย\nซก\nซ่ก\nซงดำ\nซ่งฮื้อ\nซด\nซน\nซ้น\nซบ\nซม\nซวดเซ\nซวน\nซวย\nซอ\nซอก\nซอง\nซ่อง\nซ้อง\nซองแมว\nซ้องแมว\nซอน\nซ่อน\nซ้อน\nซอม\nซ่อม\nซ้อม\nซอมซ่อ\nซอย\nซอส\nซัก\nซักส
้าว\nซัง\nซั้ง\nซัด\nซับ\nซัลฟา\nซั้ว\nซา\nซ่า\nซาก\nซาง\nซ่าง\nซาด\nซาน\nซ่าน\nซาบซ่าน\nซาบซึ้ง\nซ่าโบะ\nซาแมเรียม\nซ้าย\nซาลาเปา\nซาว\nซ่าหริ่ม\nซำ\nซ้ำ\nซิ\nซี\nซิก\nซิกข์\nซิกซี้\nซิกแซ็ก\nซิการ์\nซิงโคนา\nซิ่น\nซินนามิก\nซินแส\nซิบ\nซิป\nซิฟิลิส\nซิลิคอน\nซิว\nซี่\nซีก\nซีเซียม\nซีด\nซี้ด\nซีนอน\nซีป่าย\nซีเมนต์\nซีเรียม\nซีลีเนียม\nซีอิ๊ว\nซึก\nซึง\nซึ่ง\nซึ้ง\nซึม\nซื่อ\nซื้อ\nซุก\nซุง\nซุน\nซุบ\nซุป\nซุ่ม\nซุ้ม\nซุ่มซ่าม\nซุย\nซู่\nซูโครส\nซูด\nซู้ด\nซูดซาด\nซูบ\nเซ\nเซ็ก\nเซแคนต์\nเซ็ง\nเซ่ง\nเซ้ง\nเซ็งแซ่\nเซต\nเซน\nเซ็น\nเซ่น\nเซนติกรัม\nเซนติเกรด\nเซนติเมตร\nเซนติลิตร\nเซปักตะกร้อ\nเซราะ\nเซรุ่ม\nเซลเซียส\nเซลล์\nเซลลูลอยด์\nเซลลูโลส\nเซ่อ\nเซอร์โคเนียม\nเซอะ\nเซา\nเซ้าซี้\nเซาะ\nเซิง\nเซิ้ง\nเซียน\nเซียบ\nเซียมซี\nเซียว\nเซี่ยว\nเซี่ยวกาง\nเซื่อง\nแซ\nแซ่\nแซ็กคาริน\nแซง\nแซงแซว\nแซด\nแซบ\nแซม\nแซยิด\nแซ่ว\nแซะ\nโซ\nโซ่\nโซก\nโซ่ง\nโซงโขดง\nโซเซ\nโซดา\nโซเดียม\nโซม\nโซรม\nโซลา\nไซ\nไซ้\nไซเกิล\nไซโคลน\nไซน์\nไซยาไนด์\nไซร้\nไซเรน\nไซโล\nฌาน\nฌาปน\nฌาปนกิจ\nฌาปนสถาน\nเฌอ\nเฌอเอม\nญวน\nญัตติ\nญาณ\nญาติ\nญานาซะฮ์\nญิบ\nญี่ปุ่น\nเญยธรรม\nไญยธรรม\nฎีกา\nฐกัด\nฐากูร\nฐาน\nฐานะ\nฐานันดร\nฐานานุกรม\nฐานานุรูป\nฐานานุศักดิ์\nฐานียะ\nฐาปน\nฐาปนา\nฐายี\nฐิต\nฐิติ\nฑาหก\nฑาหะ\nเฒ่า\nณรงค์\nเณร\nดก\nดง\nด้ง\nด้น\nดนโด่\nดนตรี\nดนัย\nดนุ\nดนู\nดบัสวิน\nดบัสวี\nดม\nดรงค์\nดรณี\nดรรชนี\nดราฟต์\nดรุณ\nดรุณี\nดล\nดวง\nด้วง\nดวด\nด่วน\nด้วน\nด้วย\nดอก\nดอง\nด่อง\nด้อง\nดองฉาย\nดองดึง\nดอด\nดอน\nด่อน\nดอม\nด้อม\nดอย\nด้อย\nดอลลาร์\nดะ\nดะโต๊ะ\nดะหมัง\nดัก\nดักดาน\nดักแด้\nดัง\nดั่ง\nดั้ง\nดัชนี\nดัด\nดัตช์\nดัน\nดั้น\nดับ\nดัมพ์\nดั้วเดี้ย\nดัสกร\nดา\nด่า\nดาก\nด่าง\nด้าง\nดาด\nดาน\nด่าน\nด้าน\nดาบ\nดาบส\nดาม\nด้าม\nด้ามจิ้ว\nดามพ์\nดาย\nด้าย\nดารกะ\nดารณี\nดารดาษ\nดาระ\nดารา\nดาล\nดาลัด\nดาว\nด่าว\nด้าว\nดาวดึงส์\nดาวบส\nดาษ\nดาษดา\nดำ\nด่ำ\nด้ำ\nดำกล\nดำเกิง\nดำแคง\nดำดง\nดำนาณ\nดำเนิน\nดำบล\nดำรง\nดำรวจ\nดำรัส\nดำริ\nดำรี\nดำรู\nดำฤษณา\nดำเลิง\nดิก\nดิ่ง\nดิฉัน\nดิฐ\nดิตถ์\nดิถี\nดิน\nดิ้น\nดิบ\nดิรัจฉาน\nดิลก\nดิ่ว\nดิ้ว\nดิ้วเดี้ยว\nดิษฐ์\nดิสโพรเซียม\nดี\nดีเซล\nดีด\
nดีดีที\nดีบุก\nดีปลี\nดีเปรสชัน\nดีหมี\nดีหลี\nดึก\nดึง\nดึ่ง\nดึ่ม\nดื่น\nดื่ม\nดือ\nดื้อ\nดุ\nดุก\nดุกดิก\nดุกทะเล\nดุ้ง\nดุ้งดิ้ง\nดุจ\nดุด\nดุน\nดุ้น\nดุบ\nดุม\nดุ่ม\nดุ่ย\nดุรงค์\nดุริยะ\nดุริยางค์\nดุริยางคศาสตร์\nดุริยางคศิลป์\nดุล\nดุษฎี\nดุษณี\nดุษณีภาพ\nดุษิต\nดุสิต\nดุเหว่า\nดู\nดูกค่าง\nดูกร\nดูด\nดูรา\nดูแล\nเด\nเด่\nเดก\nเด็ก\nเดกซ์โทรส\nเดคากรัม\nเดคาเมตร\nเดคาลิตร\nเด้ง\nเด็จ\nเดช\nเดชน์\nเดชนะ\nเดชะ\nเดโช\nเดซิกรัม\nเดซิเมตร\nเดซิลิตร\nเด็ด\nเดน\nเด่น\nเดนมาร์ก\nเดรัจฉาน\nเด๋อ\nเด๋อด๋า\nเดา\nเด้า\nเดาะ\nเดิน\nเดิ่น\nเดิม\nเดียง\nเดียด\nเดียรดาษ\nเดียรถีย์\nเดียรัจฉาน\nเดียว\nเดี่ยว\nเดี๋ยว\nเดียะ\nเดื่อ\nเดือก\nเดื่อง\nเดือด\nเดือน\nเดือย\nแด\nแด่\nแดก\nแด็ก\nแดกงา\nแดกแด้\nแดง\nแดด\nแดน\nแด่น\nแด่ว\nแดะ\nแดะแด๋\nโด\nโด่\nโดกเดก\nโด่ง\nโดด\nโดน\nโดม\nโดมร\nโดย\nโดรณ\nใด\nได\nได้\nไดแซ็กคาไรด์\nไดนาโม\nไดนาไมต์\nไดโนเสาร์\nไดเรกตริกซ์\nตก\nต๊กโต\nตง\nต๋ง\nตงฉิน\nตงิด\nตงุ่น\nตด\nตติย\nตถาคต\nตน\nต้น\nตนัย\nตนุ\nตบ\nตบะ\nตปนียะ\nตม\nต้ม\nตมูก\nตยาคี\nตรง\nตรณี\nตรม\nตรรก\nตรรกะ\nตรลบ\nตรลอด\nตรลาด\nตรวจ\nตรวน\nตรอก\nตรอง\nตรอมใจ\nตรอมตรม\nตระ\nตระกล\nตระกวน\nตระกอง\nตระการ\nตระกูล\nตระคัร\nตระเตรียม\nตระนาว\nตระบก\nตระบอก\nตระบอง\nตระบัด\nตระบัน\nตระเบ็ง\nตระแบก\nตระแบง\nตระโบม\nตระพอง\nตระพัง\nตระลาการ\nตระวัน\nตระเว็ด\nตระเวน\nตระสัก\nตระหง่าน\nตระหนก\nตระหนัก\nตระหน่ำ\nตระหนี่\nตรัง\nตรังค์\nตรับ\nตรับฟัง\nตรัย\nตรัยตรึงศ์\nตรัส\nตรัสสา\nตรา\nตรากตรำ\nตราชู\nตราบ\nตราสัง\nตรำ\nตริ\nตริว\nตรี\nตรีปวาย\nตรีพิธพรรณ\nตรียัมปวาย\nตรึก\nตรึง\nตรุ\nตรุณ\nตรุณะ\nตรุษ\nตรู\nตรู่\nตฤณ\nตฤตีย\nตฤษณา\nตลก\nตลบ\nตลอด\nตลับ\nตลาด\nตลิ่ง\nตลึง\nตวง\nต่วน\nต้วมเตี้ยม\nตวัก\nตวัด\nตวาด\nตอ\nตอม่อ\nต่อ\nต้อ\nตอก\nต๊อก\nต๊อกต๋อย\nตอง\nต้อง\nตองกราย\nต้องเต\nตองแตก\nต่องแต่ง\nตองเหลือง\nตอด\nตอน\nต้อน\nตอบ\nตอเบา\nตอม\nต่อม\nต๋อม\nต่อย\nต้อย\nต้อยตริ่ง\nต้อยติ่ง\nต้อยตีวิด\nตอแย\nตอร์ปิโด\nต่อไส้\nตอแหล\nตะ\nตะกรน\nตะกร้อ\nตะกรัน\nตะกรับ\nตะกร้า\nตะกราม\nตะกรุด\nตะกรุม\nตะกรุมตะกราม\nตะกละ\nตะกลาม\nตะกวด\nตะกอ\nตะกอน\nตะกัง\nตะกั่ว\nตะกาง\nตะกาด\nตะกาย\nตะกาว\nตะกุกต
ะกัก\nตะกุย\nตะกู\nตะกูด\nตะเกียกตะกาย\nตะเกียง\nตะเกียบ\nตะแก\nตะแก่\nตะแกรง\nตะโก\nตะโก้\nตะโกก\nตะโกน\nตะโกรง\nตะโกรม\nตะไกร\nตะขบ\nตะขอ\nตะขาบ\nตะขิดตะขวง\nตะเข้\nตะเข็บ\nตะโขง\nตะคร้อ\nตะครอง\nตะครั่นตะครอ\nตะคร้ำ\nตะคริว\nตะคิว\nตะครุบ\nตะคอก\nตะคัน\nตะค้า\nตะคาก\nตะค้าน\nตะคุ่ม\nตะเครียว\nตะเคียว\nตะเคียน\nตะแคง\nตะไคร่\nตะไคร้\nตะเฆ่\nตะนอย\nตะนาว\nตะบม\nตะบอง\nตะบอย\nตะบัน\nตะบิ้ง\nตะบิด\nตะบิดตะบอย\nตะบี้ตะบัน\nตะบึง\nตะบูน\nตะเบ็ง\nตะเบ็งมาน\nตะเบ๊ะ\nตะแบก\nตะแบง\nตะโบม\nตะไบ\nตะปบ\nตะปลิง\nตะปิ้ง\nตะปุ่มตะป่ำ\nตะปู\nตะพง\nตะพด\nตะพอง\nตะพัก\nตะพัง\nตะพัด\nตะพั้น\nตะพาก\nตะพาน\nตะพาบ\nตะพาย\nตะพึด\nตะพึดตะพือ\nตะพุ่น\nตะเพรา\nตะเพิง\nตะเพิด\nตะเพียน\nตะโพก\nตะโพง\nตะโพน\nตะเภา\nตะใภ้\nตะม่อ\nตะมอย\nตะรังกะนู\nตะรังตังกวาง\nตะรังตังช้าง\nตะราง\nตะลอง\nตะลอน\nตะล่อม\nตะละ\nตะลาน\nตะลิงปลิง\nตะลิบ\nตะลีตะลาน\nตะลึง\nตะลึงพรึงเพริด\nตะลุง\nตะลุ่ม\nตะลุ่มนก\nตะลุมบอน\nตะลุ่มโปง\nตะลุมพอ\nตะลุมพุก\nตะลุย\nตะเลง\nตะแลงแกง\nตะไล\nตะวัน\nตะเวน\nตะหลิว\nตะหลุก\nตะหลุง\nตะแหง่ว\nตะแหมะแขะ\nตะโหงก\nตัก\nตักกะ\nตักเตือน\nตั๊กแตน\nตักษัย\nตัง\nตั่ง\nตั้ง\nตังเก\nตังฉ่าย\nตังเม\nตังวาย\nตังโอ๋\nตัจฉก\nตัจฉนี\nตัณฑุล\nตัณหา\nตัด\nตัน\nตันตระ\nตันติ\nตันหยง\nตับ\nตับปิ้ง\nตัว\nตั๋ว\nตัวจี๊ด\nตัวตืด\nตั้วโผ\nตั้วเหี่ย\nตา\nตาก\nตากวาง\nต่าง\nตาด\nตาน\nต่าน\nต้าน\nตานนกกด\nตานี\nตาบ\nตาม\nตามะแน\nตามิน\nตาย\nตาราไต\nตาล\nตาลุ\nต๋าว\nตาเสือ\nตาหนู\nตาฬ\nตำ\nต่ำ\nตำนาน\nตำบล\nตำแบ\nตำแย\nตำรวจ\nตำรับ\nตำรา\nตำรุ\nตำลึง\nตำเสา\nตำหนัก\nตำหนิ\nตำแหน่ง\nติ\nติก\nติ๊ก\nติกะ\nติกาหรัง\nติง\nติ่ง\nติ๋ง\nติ่งตั่ง\nติ๋งต่าง\nติงส\nติงสติ\nติณ\nติด\nติตติกะ\nติตติร\nติตถ\nติตถะ\nติถี\nติมิงคละ\nติรัจฉาน\nติลก\nติละ\nติ้ว\nตี\nตี่\nตีน\nตีบ\nตีรถะ\nตีระ\nตึ\nตึก\nตึ้ก\nตึ้กตั้ก\nตึง\nตึดตื๋อ\nตึ๊ดตื๋อ\nตืด\nตื่น\nตื้น\nตื้อ\nตื๊อ\nตื๋อ\nตุ\nตุ๊\nตุ๊กแก\nตุ๊กตา\nตุ๊กต่ำ\nตุกติก\nตุ๊กติ๊ก\nตุ๊กตุ่น\nตุ๊กตุ๋ย\nตุง\nตุ้งก่า\nตุ้งติ้ง\nตุ๊ดตู่\nตุน\nตุ่น\nตุ๋น\nตุนาหงัน\nตุบ\nตุ้บ\nตุ๊บป่อง\nตุปัดตุป่อง\nตุปัดตุเป๋\nตุ่ม\nตุ้ม\nตุ๋ม\nตุ้มกว้าว\nตุมกา\nตุ้มแซะ\nตุมตัง\nตุ้มเต๋น\nตุ้มปี่\nตุม
พะ\nตุ่ย\nตุ้ย\nตุ๊ย\nตุ๋ยตุ่ย\nตุรคะ\nตุรงค์\nตุล\nตุลา\nตุหรัดตุเหร่\nตู\nตู่\nตู้\nตูก\nตูด\nตูบ\nตูม\nเต๊ก\nเต็ง\nเต่ง\nเตช\nเตโช\nเต้น\nเต็นท์\nเต็ม\nเตย\nเตร่\nเตร็ด\nเตรตา\nเตรียม\nเตรียมตรม\nเตละ\nเตลิด\nเตว็ด\nเต่อ\nเตอะ\nเตะ\nเตา\nเต่า\nเต้า\nเต๋า\nเต่าเกียด\nเต้าเจี้ยว\nเต้าทึง\nเต้าหู้\nเต้าฮวย\nเต๊าะ\nเตาะแตะ\nเติ่ง\nเติบ\nเติม\nเตี้ย\nเตียง\nเตียน\nเตียบ\nเตี๋ยม\nเตียรถ์\nเตียว\nเตี่ยว\nเตือน\nแต่\nแต้\nแตก\nแตง\nแต่ง\nแตงเม\nแต้จิ๋ว\nแตด\nแต๊ดแต๋\nแตน\nแต้ม\nแตร\nแตระ\nแต้ว\nแต้วแร้ว\nแต้วแล้ว\nแต่ว่า\nแต้แว้ด\nแตะ\nโต\nโต้\nโตก\nโต่ง\nโต้ง\nโตงเตง\nโตฎก\nโต๊ด\nโตน\nโตนด\nโต้โผ\nโตมร\nโตย\nโตรก\nโต๊ะ\nใต้\nไต\nไต่\nไต้\nไต๋\nไตร\nไตรกิศยา\nไตรดายุค\nไตร่ตรอง\nไตรย\nไต้หวัน\nถก\nถกล\nถงาด\nถด\nถนน\nถนอม\nถนัด\nถนัน\nถนำ\nถนิม\nถม\nถ่ม\nถมอ\nถมึงทึง\nถลก\nถลกบาตร\nถลน\nถล่ม\nถลอก\nถลัน\nถลา\nถลาก\nถลาย\nถลำ\nถลึงตา\nถลุง\nถ่วง\nถ้วน\nถ้วย\nถวัล\nถวัลย์\nถวาย\nถวิน\nถวิล\nถ่อ\nถ้อ\nถอก\nถอง\nถ่อง\nถ้อง\nถอด\nถอน\nถอบ\nถอบแถบ\nถ่อม\nถอย\nถ่อย\nถ้อย\nถะ\nถะถั่น\nถะมัดถะแมง\nถัก\nถัง\nถั่ง\nถัด\nถัทธ\nถัน\nถั่น\nถับ\nถัมภ์\nถัว\nถั่ว\nถา\nถ้า\nถาก\nถาง\nถ่าง\nถาด\nถาน\nถ่าน\nถาบ\nถาม\nถามะ\nถ่าย\nถ่าว\nถาวร\nถาวรธิรา\nถ้ำ\nถิ่น\nถี่\nถีบ\nถึก\nถึง\nถือ\nถุง\nถุน\nถุย\nถู\nถูก\nเถกิง\nเถน\nเถร\nเถระ\nเถรานุเถระ\nเถรี\nเถลไถล\nเถลิง\nเถลือกถลน\nเถ่อ\nเถอะ\nเถา\nเถ้า\nเถาวัลย์\nเถาะ\nเถิก\nเถิง\nเถิด\nเถิดเทิง\nเถิน\nเถียง\nเถียร\nเถือ\nเถือก\nเถื่อน\nแถ\nแถก\nแถง\nแถน\nแถบ\nแถม\nแถลง\nแถลบ\nแถว\nโถ\nโถง\nโถงเถง\nโถบ\nโถม\nโถมนาการ\nไถ\nไถ่\nไถ้\nไถง\nไถล\nทกล้า\nทแกล้ว\nท่ง\nทด\nทน\nท้น\nทนต์\nทนโท่\nทนาย\nทบ\nทบวง\nทมอ\nทมะ\nทมิฬ\nทโมน\nทยอย\nทแยง\nทรกรรม\nทรชน\nทรชาติ\nทรพิษ\nทรยศ\nทรราช\nทรลักษณ์\nทรง\nทรพี\nทรมาทรกรรม\nทรมาน\nทรรทึง\nทรรป\nทรรปณ์\nทรรปณะ\nทรรศนะ\nทรรศนาการ\nทรรศนีย์\nทรวง\nทรวดทรง\nทรวาร\nทรหด\nทรหวล\nทรหึง\nทรอมโบน\nทระนง\nทรัพย์\nทรัพยากร\nทรัมเป็ต\nทรานซิสเตอร์\nทราบ\nทราม\nทราย\nทรุด\nทฤษฎี\nทลาย\nทลิท\nทลิททก\nทวง\nท้วง\nท่วงท่า\nท่วงทำนอง\nท่วงที\nทวด\nทวน\nท้วน\nท่วม\nท้วม\nทวย\nท่วย\nท้วย\nทวอย\nทวัตดึงส์\nทวัย\nทวา\nทวาบร\nทว่า\n
ทวาย\nทวาร\nทวิ\nทวิช\nทวิตีย์\nทวิตียา\nทวี\nทวีธาภิเษก\nทวีป\nทศ\nทศมี\nทศางค์\nทหระ\nทหาร\nทอ\nท่อ\nท้อ\nทอก\nทอง\nท่อง\nท้อง\nทองกวาว\nทองภู\nทองลิน\nทองหลาง\nทองโหลง\nทองอุไร\nทอด\nทอน\nท่อน\nทอนซิล\nทอฟฟี่\nท่อม\nทอย\nทอเรียม\nทะ\nทะงัน\nทะนง\nทะนน\nทะนาน\nทะนุ\nทะเบียน\nทะมัดทะแมง\nทะมึน\nทะมื่น\nทะแม่ง\nทะยาน\nทะเยอทะยาน\nทะแย\nทะร่อทะแร่\nทะลวง\nทะลอก\nทะลัก\nทะลาย\nทะลึ่ง\nทะลุ\nทะลุดทะลาด\nทะเล\nทะเล้น\nทะเล่อทะล่า\nทะเลาะ\nทะเลิ่กทะลั่ก\nทะวาย\nทัก\nทักข์\nทักขิญ\nทักขิณ\nทักขิณา\nทักขิณาวัฏ\nทักขิโณทก\nทักขิไณยบุคคล\nทักทิน\nทักษะ\nทักษา\nทักษิณ\nทักษิณา\nทักษิโณทก\nทัง\nทั่ง\nทั้ง\nทังวล\nทังวี้ทังวล\nทังสเตน\nทัณฑ์\nทัณฑกรรม\nทัณฑฆาต\nทัณฑสถาน\nทัณฑะ\nทัณฑิกา\nทัณฑิมา\nทัณฑีบท\nทัด\nทัดทา\nทัต\nทัน\nทันต์\nทันตชะ\nทันตแพทย์\nทันติน\nทันตี\nทันธ์\nทับ\nทับทิม\nทับสมิงคลา\nทัพ\nทัพพะ\nทัพพี\nทั่ว\nทัศ\nทัศน์\nทัศนะ\nทัศนา\nทัศนคติ\nทัศนวิสัย\nทัศนศาสตร์\nทัศนศิลป์\nทัศนศึกษา\nทัศนาการ\nทัศนาจร\nทัศนีย์\nทัศนียภาพ\nทัศนูปกรณ์\nทัศไนย\nทัสนานุตริยะ\nทัฬหะ\nทัฬหิ\nทัฬหี\nทา\nท่า\nท้า\nทาก\nทาง\nท้าง\nทาฐะ\nทาฐิกะ\nทาฒะ\nทาฒิกะ\nทาน\nท่าน\nทานต์\nทานพ\nทาบ\nทาม\nท่ามกลาง\nทาย\nท้าย\nทายก\nทายัช\nทายาด\nทายาท\nทายิกา\nทารก\nทารพี\nทาริกา\nทารุณ\nทาว\nท่าว\nท้าว\nทาส\nทาสี\nทำ\nทำนบ\nทำนอง\nทำนาย\nทำนุ\nทำนูล\nทำเนา\nทำเนียบ\nทำไม\nทำลาย\nทำเล\nทิคัมพร\nทิฆัมพร\nทิ้ง\nทิงเจอร์\nทิ้งถ่อน\nทิ้งทูด\nทิชะ\nทิชากร\nทิชาชาติ\nทิฏฐะ\nทิฏฐานุคติ\nทิฏฐุชุกรรม\nทิฐธรรม\nทิฐิ\nทิด\nทิต\nทิน\nทิพ\nทิพย์\nทิพา\nทิม\nทิ่ม\nทิมทอง\nทิว\nทิวงคต\nทิวทัศน์\nทิวา\nทิศ\nทิศา\nทิศานุทิศ\nที\nที่\nทีฆชาติ\nทีฆนิกาย\nทีฆสระ\nทีฆายุ\nทีป\nทีม\nทีเอ็นที\nทึก\nทึกทัก\nทึ่ง\nทึ้ง\nทึดทือ\nทึนทึก\nทึบ\nทึม\nทึ่ม\nทื่อ\nทุ\nทุก\nทุกข์\nทุกขลาภ\nทุกขเวทนา\nทุกขารมณ์\nทุกฏ\nทุกรกิริยา\nทุกะ\nทุกัง\nทุกูล\nทุคตะ\nทุคติ\nทุ่ง\nทุ้ง\nทุงงะ\nทุจริต\nทุด\nทุทรรศนนิยม\nทุนิยม\nทุน\nทุ่น\nทุนนิมิต\nทุบ\nทุบทู\nทุปปัญญา\nทุพพรรณ\nทุพพล\nทุพพลภาพ\nทุพภิกขภัย\nทุม\nทุ่ม\nทุ้ม\nทุย\nทุ้ย\nทุรกันดาร\nทุรชน\nทุรชาติ\nทุรพล\nทุรลักษณ์\nทุรน\nทุรนทุราย\nทุรัศ\nทุราคม\nทุราจาร\nทุเรศ\nทุเรียน\nทุลักทุเล\n
ทุเลา\nทุศีล\nทุสสะ\nทุสสีล\nทู\nทูโม่ง\nทู่\nทู้\nทูกัง\nทู่ซี้\nทูต\nทูตานุทูต\nทูน\nทูบ\nทูม\nทูล\nทูเลียม\nเท\nเท่\nเทคนิค\nเทคนีเชียม\nเทคโนโลยี\nเท้ง\nเท้งเต้ง\nเท็จ\nเทนนิส\nเทพ\nเทพา\nเทพารักษ์\nเทพยเจ้า\nเทพยดา\nเทพยุดา\nเทพิน\nเทพินทร์\nเทพี\nเทเพนทร์\nเทโพ\nเทริด\nเทลลูเรียม\nเทวทัณฑ์\nเทวดา\nเทวทูต\nเทวธรรม\nเทวนาครี\nเทวนิยม\nเทวรูป\nเทวโลก\nเทววิทยา\nเทวสถาน\nเทวศ\nเทวษ\nเทวัญ\nเทวัน\nเทวาลัย\nเทวินทร์\nเทวี\nเทเวศ\nเทเวศร์\nเทเวศวร์\nเทศ\nเทศะ\nเทศาภิบาล\nเทศน์\nเทศนา\nเทห์\nเท่ห์\nเทห์ฟากฟ้า\nเทหวัตถุ\nเท่อ\nเท้อ\nเทอญ\nเทอม\nเทอร์เบียม\nเทอร์โมมิเตอร์\nเทอะทะ\nเทา\nเท่า\nเท้า\nเท้ายายม่อม\nเท่ารึง\nเทิ่ง\nเทิด\nเทิน\nเทิบ\nเทิบทาบ\nเทิ้ม\nเที่ยง\nเทียด\nเทียน\nเที้ยน\nเทียบ\nเทียม\nเทียร\nเที้ยร\nเทียว\nเที่ยว\nเทือ\nเทื่อ\nเทื้อ\nเทือก\nแท้\nแท็กซี่\nแทง\nแท่ง\nแท้ง\nแท็งก์\nแทงทวย\nแทงวิสัย\nแทตย์\nแทน\nแท่น\nแทนเจนต์\nแทนทาลัม\nแทบ\nแทรก\nแทรกเตอร์\nแทลเลียม\nแทะ\nโท\nโท่\nโทกเทก\nโทง\nโทงเทง\nโทณะ\nโทน\nโทนโท่\nโทมนัส\nโทรคมนาคม\nโทรทรรศน์\nโทรทัศน์\nโทรพิมพ์\nโทรภาพ\nโทรเลข\nโทรศัพท์\nโทรสาร\nโทรม\nโทษ\nโทษา\nโทษานุโทษ\nโทสะ\nโทสาคติ\nโทโส\nโทหฬินี\nไท\nไท้\nไทเทเนียม\nไทเทรต\nไทย\nไทร\nไทวะ\nธง\nธงก์\nธชะ\nธชี\nธตรฐ\nธนบัตร\nธนสมบัติ\nธนสาร\nธนะ\nธนา\nธนาคม\nธนาคาร\nธนาณัติ\nธเนศ\nธโนปจัย\nธไนศวรรย์\nธนิต\nธนิษฐะ\nธนิษฐา\nธนุ\nธนุรวิทยา\nธนุรเวท\nธนู\nธม\nธมกรก\nธรณะ\nธรณิน\nธรณินทร์\nธรณิศ\nธรณิศร\nธรณิศวร์\nธรณี\nธรมาน\nธรรม\nธรรมนูญ\nธรรมยุต\nธรรมยุติกนิกาย\nธรรมะ\nธรรมาทิตย์\nธรรมาธรรม\nธรรมาธิปไตย\nธรรมาธิษฐาน\nธรรมานุสาร\nธรรมาภิมุข\nธรรมาภิสมัย\nธรรมายตนะ\nธรรมารมณ์\nธรรมาสน์\nธรรมิก\nธรา\nธราดล\nธราธร\nธราธาร\nธราธิบดี\nธราธิป\nธริษตรี\nธเรษตรี\nธเรศ\nธวัช\nธัช\nธัญ\nธัญญาหาร\nธันยา\nธันยาวาท\nธันวาคม\nธัมมะ\nธาดา\nธาตรี\nธาตวากร\nธาตุ\nธาตุโขภ\nธาตุมมิสสา\nธานิน\nธานินทร์\nธานี\nธาร\nธารกำนัล\nธารคำนัล\nธารณะ\nธารณา\nธารา\nธาษตรี\nธำมรงค์\nธำรง\nธิดา\nธิติ\nธีระ\nธุช\nธุดงค์\nธุดงควัตร\nธุต\nธุตตะ\nธุมเกตุ\nธุมา\nธุรการ\nธุรกิจ\nธุระ\nธุรำ\nธุลี\nธุวดารา\nธุวภาค\nธุวมณฑล\nธูป\nเธนุ\nเธอ\nเธียร\nโธ่\nโธวนะ\nนก\nนกุล\nนขลิขิต\nนขะ\nนขา\
nนเคนทร์\nนโคทร\nนคร\nนครินทร์\nนคเรศ\nนง\nนงคุฐ\nนที\nนนตรา\nนนท์\nนันทน์\nนนทรี\nนนทลี\nนนทิ\nนบ\nนปุงสกลิงค์\nนปุงสกลึงค์\nนพ\nนพนิต\nนภจร\nนภดล\nนภศูล\nนภา\nนภาลัย\nนม\nนมตำเรีย\nนมตำเลีย\nนมะ\nนมัสการ\nนมาซ\nนยนะ\nนยนา\nนโยบาย\nนรชาติ\nนรเทพ\nนรนาถ\nนรบดี\nนรบาล\nนรสิงห์\nนรสีห์\nนรา\nนรากร\nนราธิป\nนรินทร์\nนริศ\nนริศร\nนริศวร\nนเรศ\nนเรศวร\nนเรศวร์\nนโรดม\nนรก\nนรกานต์\nนรการ\nนรี\nนฤเทพ\nนฤบดี\nนฤบาล\nนฤเบศ\nนฤปเวศม์\nนฤปัตนี\nนฤคหิต\nนฤนาท\nนฤมล\nนฤตย์\nนฤตยสถาน\nนฤพาน\nนฤมาณ\nนฤมิต\nนลาฏ\nนลิน\nนลินี\nนวกรรม\nนวการ\nนวกิจ\nนวนิยาย\nนวปฎล\nนวรัตน์\nนวโลหะ\nนวกะ\nนวโกวาท\nนวด\nนวม\nน่วม\nนวมี\nนวย\nนวล\nนวัตกรรม\nนวาระ\nนหารุ\nนหุต\nนฬการ\nนอ\nนอก\nนอง\nน่อง\nน้อง\nน่องแน่ง\nนอต\nนอน\nนอบ\nน้อม\nน้อย\nน้อยหน่า\nน้อยโหน่ง\nนะ\nนะแน่ง\nนัก\nนักขัต\nนักขัตฤกษ์\nนักษัตร\nนักสราช\nนัข\nนั่ง\nนังคัล\nนัจ\nนัฑ\nนัด\nนัดดา\nนัตถุ์\nนั่น\nนั้น\nนันททายี\nนันทนาการ\nนันทวัน\nนันทิ\nนับ\nนัย\nนัยน์\nนัยนา\nนัว\nนัวเนีย\nนา\nน่า\nน้า\nนาก\nนากบุด\nนากาสาหรี\nนาค\nนาคร\nนาคา\nนาคาวโลก\nนาคินทร์\nนาคี\nนาเคนทร์\nนาเคศวร\nนาง\nนางเกล็ด\nนางนวล\nนางนูน\nนางรม\nนางรำ\nนางล้อม\nนางเล็ด\nนางเลิ้ง\nนางหงส์\nนางอาย\nนางแอ่น\nนาฏ\nนาฏกะ\nนาด\nนาถ\nนาท\nนาที\nนาน\nน่าน\nนานัครส\nนานัปการ\nนานา\nนาเนก\nนาบ\nนาภี\nนาม\nนามานุกรม\nนามาภิไธย\nนาย\nน่าย\nนายก\nนายิกา\nนารา\nนารายณ์\nนารี\nนาเรศ\nนาลิวัน\nนาว\nน้าว\nนาวา\nนาวิก\nนาวิน\nนาวี\nนาเวศ\nนาศ\nนาสา\nนาสิก\nนาฬิกา\nนาฬิเก\nนาฬี\nนำ\nน้ำ\nน้ำละว้า\nน้ำว้า\nนิ\nนิกเกิล\nนิกขะ\nนิกร\nนิกรอยด์\nนิกาย\nนิคม\nนิครนถ์\nนิคหกรรม\nนิคหะ\nนิคหิต\nนิคาลัย\nนิเคราะห์\nนิโคติน\nนิโครธ\nนิโครม\nนิ่ง\nนิจ\nนิด\nนิตย์\nนิตยทาน\nนิตยภัต\nนิตยสาร\nนิติ\nนิทร\nนิทรรศการ\nนิทรา\nนิทรารมณ์\nนิทัศน์\nนิทาน\nนิเทศ\nนิธาน\nนิธิ\nนินทา\nนินนาท\nนินาท\nนิบาต\nนิปริยาย\nนิปัจการ\nนิพจน์\nนิพนธ์\nนิพพาน\nนิพพิทา\nนิพัทธ์\nนิพันธ์\nนิพิท\nนิเพท\nนิภา\nนิ่ม\nนิ้ม\nนิมนต์\nนิมมาน\nนิมมานรดี\nนิมิต\nนิยต\nนิยม\nนิยัตินิยม\nนิยาม\nนิยาย\nนิยุต\nนิรคุณ\nนิรชร\nนิรชรา\nนิรทุกข์\nนิรเทศ\nนิรโทษ\nนิรโทษกรรม\nนิรนัย\nนิรนาม\nนิรภัย\nนิรมล\nนิรมาน\nนิรัตศัย\nนิรันดร
\nนิราพาธ\nนิรามัย\nนิรามิษ\nนิราศรัย\nนิรินธน์\nนิรมาณ\nนิรมิต\nนิรยบาล\nนิรัพพุท\nนิรา\nนิราศ\nนิรุกติ\nนิรุตติ\nนิรุทธ์\nนิโรธ\nนิล\nนิลุบล\nนิโลบล\nนิ่ว\nนิ้ว\nนิวคลิอิก\nนิวเคลียร์\nนิวเคลียส\nนิวตรอน\nนิวรณ์\nนิวัต\nนิวัตน์\nนิวาต\nนิวาส\nนิเวศ\nนิเวศน์\nนิศา\nนิษาท\nนิสัช\nนิสัชชาการ\nนิสัย\nนิสาท\nนิสิต\nนิสีทนสันถัต\nนิสีทนะ\nนิสีทนาการ\nนิเสธ\nนี่\nนี้\nนี่นัน\nนีรนาท\nนีออน\nนีโอดิเมียม\nนึก\nนึง\nนึ่ง\nนุง\nนุ่ง\nนุงนัง\nนุช\nนุต\nนุ่น\nนุ่ม\nนุ้ย\nนูน\nนู่น\nนู้น\nเนกขัม\nเนตบอล\nเนตร\nเนติ\nเน้น\nเนบิวลา\nเนปจูน\nเนปทูเนียม\nเนมิ\nเนย\nเนรกัณฐี\nเนรคุณ\nเนรเทศ\nเนรนาด\nเนรมิต\nเนระพูสี\nเนอ\nเน้อ\nเนา\nเน่า\nเนาวนิต\nเนาวรัตน์\nเนิน\nเนิ่น\nเนิบ\nเนียง\nเนียน\nเนียม\nเนียร\nเนียรทุกข์\nเนียรเทศ\nเนียรนาท\nเนื้อ\nเนือง\nเนื่อง\nเนือย\nแน่\nแน่ง\nแน่น\nแนบ\nแน่บ\nแนม\nแนว\nแน่ว\nแนะ\nแน่ะ\nแนะแหน\nโน\nโน้ต\nโนน\nโน่น\nโน้น\nโนเบเลียม\nโน้ม\nโนมพรรณ\nโนรา\nโนรี\nใน\nไน\nไนต์คลับ\nไนโตรเจน\nไนลอน\nไนโอเบียม\nบ่\nบก\nบง\nบ่ง\nบงก์\nบ๊งเบ๊ง\nบงสุ์\nบด\nบดินทร์\nบดี\nบถ\nบท\nบน\nบ่น\nบพิตร\nบพิธ\nบ่ม\nบรม\nบรมัตถ์\nบรรจง\nบรรจถรณ์\nบรรจบ\nบรรจวบ\nบรรจุ\nบรรเจิด\nบรรณ\nบรรดา\nบรรตานึก\nบรรถร\nบรรทม\nบรรทัด\nบรรทาน\nบรรทุก\nบรรเทา\nบรรเทือง\nบรรพ\nบรรพ์\nบรรพชา\nบรรพชิต\nบรรพต\nบรรยง\nบรรยงก์\nบรรยเวกษก์\nบรรยากาศ\nบรรยาย\nบรรลัย\nบรรลาย\nบรรลุ\nบรรเลง\nบรรโลม\nบรรษัท\nบรรสบ\nบรรสพ\nบรรสม\nบรรสาน\nบรรสาร\nบรรหาน\nบรรหาร\nบรอนซ์\nบรั่นดี\nบรัศว์\nบราลี\nบริกรรม\nบริการ\nบริขาร\nบริขารโจล\nบริคณห์\nบริจาค\nบริจาริกา\nบริเฉท\nบริชน\nบริดจ์\nบริบท\nบริบาล\nบริบูรณ์\nบริพนธ์\nบริพัตร\nบริพันธ์\nบริพาชก\nบริพาร\nบริภัณฑ์\nบริภาษ\nบริโภค\nบริมาส\nบริยาย\nบริรม\nบริรักษ์\nบริราช\nบริวรรต\nบริวาร\nบริวาส\nบริเวณ\nบริษัท\nบริสชน\nบริสุทธิ์\nบริหาร\nบล็อก\nบวก\nบวง\nบ่วง\nบวช\nบวน\nบ้วน\nบวบ\nบวม\nบ๊วย\nบวร\nบหลิ่ม\nบอ\nบ่อ\nบ้อ\nบอก\nบอง\nบ่อง\nบ้อง\nบ๊อง\nบ้องแบ๊ว\nบองหลา\nบอด\nบอน\nบ่อน\nบอบ\nบ้อม\nบ๋อม\nบ่อย\nบอระเพ็ด\nบอล\nบอลลูน\nบ้อหุ้น\nบ๊ะ\nบ๊ะจ่าง\nบะหมี่\nบัก\nบักโกรก\nบัคเตรี\nบัง\nบั้ง\nบังกะโล\nบังเกิด\nบังคน\nบังคม\nบังคล\nบังควร\nบังคับ\nบังคัล\nบังแทรก\nบ
ังวาย\nบังเวียน\nบังสุกุล\nบังสุกูลิก\nบังสูรย์\nบังหวน\nบังเหตุ\nบังเหียน\nบังอร\nบังอวจ\nบังอาจ\nบังเอิญ\nบัญจก\nบัญชร\nบัญชา\nบัญชี\nบัญญัติ\nบัญหา\nบัฏ\nบัณฑร\nบัณฑิต\nบัณฑิตย์\nบัณฑุ\nบัณฑูร\nบัณเฑาะก์\nบัณเฑาะว์\nบัณณาส\nบัณรส\nบัณรสี\nบัด\nบัดกรี\nบัดซบ\nบัดสี\nบัตร\nบัทม์\nบัน\nบั่น\nบั้น\nบันจวบ\nบันดล\nบันดาล\nบันได\nบันทึก\nบันทึง\nบันเทิง\nบันยะบันยัง\nบันลือ\nบัปผาสะ\nบัพ\nบัพชา\nบัพพาชนียกรรม\nบัล\nบัลลพ\nบัลลังก์\nบัลลูน\nบัลเลต์\nบัว\nบา\nบ่า\nบ้า\nบาก\nบาง\nบ่าง\nบ้าง\nบาจรีย์\nบาซิลลัส\nบาด\nบาดทะจิต\nบาดทะพิษ\nบาดทะยัก\nบาดาล\nบาตร\nบาท\nบาทสกุณี\nบาทหลวง\nบาทาธึก\nบาทุกา\nบาน\nบ้าน\nบานชื่น\nบานเช้า\nบานบุรี\nบานไม่รู้โรย\nบานเย็น\nบ้าบ๋า\nบาป\nบาย\nบ่าย\nบ้าย\nบาร์\nบารนี\nบารมี\nบาร์เรล\nบาร์เลย์\nบารอมิเตอร์\nบ้าระบุ่น\nบาเรียน\nบาเรียม\nบาล\nบาลี\nบ่าว\nบ่าวขุน\nบาศ\nบาศก์\nบาสเกตบอล\nบาหลี\nบ๋ำ\nบำเทิง\nบำนาญ\nบำบวง\nบำบัด\nบำเพ็ญ\nบำราบ\nบำราศ\nบำรุง\nบำรู\nบำเรอ\nบำหยัด\nบำเหน็จ\nบิ\nบิฐ\nบิณฑบาต\nบิด\nบิดร\nบิดหล่า\nบิดา\nบิตุ\nบิตุจฉา\nบิตุรงค์\nบิตุเรศ\nบิตุลา\nบิน\nบิ่น\nบินยา\nบิลเลียด\nบิวเรตต์\nบิสมัท\nบี้\nบีฑา\nบีตา\nบีบ\nบีเยศ\nบึก\nบึกบึน\nบึง\nบึ่ง\nบึ้ง\nบุ\nบุก\nบุคคล\nบุคลากร\nบุคลาธิษฐาน\nบุคลิก\nบุง\nบุ่ง\nบุ้ง\nบุ้งกี๋\nบุญ\nบุญญาธิการ\nบุญญานุภาพ\nบุญญาภินิหาร\nบุญญาภิสังขาร\nบุณฑริก\nบุณมี\nบุณย์\nบุตร\nบุตรี\nบุถุชน\nบุทคล\nบุนนาค\nบุบ\nบุบบิบ\nบุปผชาติ\nบุพกรรม\nบุพการี\nบุพกิจ\nบุพชาติ\nบุพทักษิณ\nบุพนิมิต\nบุพบท\nบุพพาจารย์\nบุพเพสันนิวาส\nบุพโพ\nบุ๋ม\nบุ่มบ่าม\nบุ้ย\nบุรณะ\nบุรพทิศ\nบุรพบท\nบุรพาจารย์\nบูรพาจารย์\nบุระ\nบุราณ\nบุรินทร์\nบุริมทิศ\nบุริมพรรษา\nบุริมสิทธิ\nบุรี\nบุรุษ\nบุโรทั่ง\nบุษกร\nบุษบก\nบุษบง\nบุษบัน\nบุษบา\nบุษบามินตรา\nบุษปราค\nบุษปะ\nบุษย์\nบุษย์น้ำทอง\nบุษยมาส\nบุษยะ\nบุษราคัม\nบุหงัน\nบุหงา\nบุหรง\nบุหรี่\nบุหลัน\nบู่\nบู้\nบูชนียสถาน\nบูชา\nบูชิต\nบูด\nบูดู\nบูร\nบูรณ์\nบูรณภาพ\nบูรณมี\nบูรณะ\nบูรณาการ\nบูรพ์\nบูรพะ\nบูรพา\nเบ้\nเบ่ง\nเบ๊จี๋\nเบญกานี\nเบญจกัลยาณี\nเบญจกามคุณ\nเบญจขันธ์\nเบญจดุริยางค์\nเบญจธรรม\nเบญจบรรพต\nเบญจพรรณ\nเบญจเพส\nเบญจมาศ\nเบญจรงค์\nเบญจศก\nเบญจศีล\nเบญจะ\nเบญจา\
nเบญจางค์\nเบญจางคประดิษฐ์\nเบญญา\nเบญพาด\nเบ็ด\nเบ็ดเตล็ด\nเบ็ดเสร็จ\nเบน\nเบนซิน\nเบรก\nเบริลเลียม\nเบส\nเบ้อ\nเบอร์\nเบอร์คีเลียม\nเบ้อเร่อ\nเบ้อเร่อเท่อ\nเบ้อเริ่ม\nเบ้อเริ่มเทิ่ม\nเบอะ\nเบอะบะ\nเบะ\nเบา\nเบ้า\nเบาราณ\nเบาะ\nเบิก\nเบิ่ง\nเบี้ย\nเบี่ยง\nเบียด\nเบียน\nเบียร์\nเบี้ยว\nเบือ\nเบื่อ\nเบื้อ\nเบื้อง\nเบือน\nแบ\nแบ้\nแบก\nแบคทีเรีย\nแบ่ง\nแบงก์\nแบดมินตัน\nแบตเตอรี่\nแบน\nแบนโจ\nแบบ\nแบ็บ\nแบเรียม\nแบหลา\nแบะ\nโบ\nโบ้\nโบ๋\nโบก\nโบกขรณี\nโบกขรพรรษ\nโบชุก\nโบต\nโบนัส\nโบ๊เบ๊\nโบย\nโบรมีน\nโบรอน\nโบราณ\nโบสถ์\nใบ\nใบ้\nไบ่\nปก\nปกติ\nปกรณ์\nปกรณัม\nปกิณกะ\nปกีรณัม\nปโกฏิ\nปง\nป่ง\nปงช้าง\nปฎล\nปฏัก\nปฏิกรณ์\nปฏิกรรม\nปฏิการะ\nปฏิกิริยา\nปฏิกูล\nปฏิคม\nปฏิคหิต\nปฏิคาหก\nปฏิฆะ\nปฏิชีวนะ\nปฏิญญา\nปฏิญาณ\nปฏิทิน\nปฏิบถ\nปฏิบัติ\nปฏิปทา\nปฏิปักษ์\nปฏิปัน\nปฏิปุจฉาพยากรณ์\nปฏิปุจฉาวาที\nปฏิพัทธ์\nปฏิพากย์\nปฏิภาค\nปฏิภาณ\nปฏิมา\nปฏิมากร\nปฏิยุทธ์\nปฏิรพ\nปฏิรูป\nปฏิโลม\nปฏิวัติ\nปฏิวาต\nปฏิวาท\nปฏิเวธ\nปฏิสนธิ\nปฏิสวะ\nปฏิสังขรณ์\nปฏิสันถาร\nปฏิสัมภิทา\nปฏิเสธ\nปฐพี\nปฐม\nปฐวี\nปณต\nปณาม\nปณิธาน\nปณิธิ\nปณีต\nปด\nปดิวรัดา\nปติ\nปถพี\nปถมัง\nปถวี\nปทัฏฐาน\nปทัสถาน\nปทานุกรม\nปทีป\nปทุม\nปน\nป่น\nปนัดดา\nปปัญจะ\nปม\nปรนัย\nปรปักษ์\nปรโลก\nปรวาที\nปรก\nปรกติ\nปรง\nปรตยักษ์\nปรน\nปรนนิบัติ\nปรนิมมิตวสวัตดี\nปรบ\nปรปักษ์\nปรมัตถ์\nปรมาจารย์\nปรมาณู\nปรมาภิไธย\nปรมาภิเษก\nปรมินทร์\nบรเมนทร์\nปรเมศวร์\nปรเมษฐ์\nปรวด\nปรวนแปร\nปรศุ\nปรสิต\nปร๋อ\nปรองดอง\nปรอด\nปรอท\nปรอย\nประ\nประกบ\nประกฤต\nประกฤติ\nประกล\nประกวด\nประกวดประขัน\nประกอบ\nประกัน\nประกับ\nประกาย\nประกายพรึก\nประการ\nประกาศ\nประกาศนียบัตร\nประกาศิต\nประกำ\nประกิด\nประกิต\nประคด\nประคนธรรพ\nประคนธรรพ์\nประคบ\nประคบประหงม\nประคอง\nประคับประคอง\nประคัลภ์\nประคำ\nประคิ่น\nประคุณ\nประเคน\nประเคราะห์\nประโคน\nประโคนธรรพ\nประโคนธรรพ์\nประโคม\nประจง\nประจญ\nประจบ\nประจบประแจง\nประจวบ\nประจ๋อประแจ๋\nประจักษ์\nประจักษนิยม\nประจัญ\nประจัน\nประจาก\nประจาค\nประจาน\nประจำ\nประจิม\nประจิ้มประเจ๋อ\nประจุ\nประจุคมน์\nประจุบัน\nประเจก\nประเจิด\nประเจิดประเจ้อ\nประเจียด\nประแจ\nประชด\nประชน\nประชวม\nประชวร\nประชัน\nประชา\nประ
ชาธิปไตย\nประชิด\nประชี\nประชุม\nประเชิญ\nประณต\nประณม\nประณาม\nประณิธาน\nประณิธิ\nประณีต\nประณุท\nประดง\nประดน\nประดวน\nประดอน\nประดอย\nประดัก\nประดักประเดิด\nประดัง\nประดับ\nประดา\nประดาก\nประดาป\nประดาษ\nประดิชญา\nประดิดประดอย\nประดิทิน\nประดิษฐ์\nประดิษฐกรรม\nประดิษฐาน\nประดุง\nประดุจ\nประดู่\nประเด\nประเด็น\nประเดิม\nประเดียง\nประเดี๋ยว\nประเดี๋ยวประด๋าว\nประแดง\nประแดะ\nประโดง\nประโดย\nประตง\nประตัก\nประตาป\nประติชญา\nประติญาณ\nประติทิน\nประติมากร\nประติมากรรม\nประติรพ\nประตู\nประถม\nประถมจินดา\nประทม\nประท้วง\nประทวน\nประทักษ์\nประทักษิณ\nประทัง\nประทัด\nประทับ\nประทาน\nประทาย\nประทาศี\nประทิน\nประทิ่น\nประทีป\nประทุฐ\nประทุน\nประทุษ\nประทุษฐ์\nประเทศ\nประเทา\nประเทียด\nประเทียบ\nประเทือง\nประธาน\nประธานาธิบดี\nประนม\nประนอ\nประนอม\nประนัง\nประนัปดา\nประนีประนอม\nประปราน\nประปราย\nประปา\nประเปรี้ยง\nประเปรียว\nประพจน์\nประพนธ์\nประพรม\nประพฤติ\nประพฤทธิ์\nประพัด\nประพัทธ์\nประพันธ์\nประพาต\nประพาส\nประพาสมหรณพ\nประพาฬ\nประพิณ\nประพิมพ์ประพาย\nประพุทธ์\nประเพณี\nประโพธ\nประไพ\nประไพร\nประภพ\nประภัสสร\nประภา\nประภาคาร\nประภาพ\nประภาษ\nประภาส\nประเภท\nประมง\nประมวล\nประมาณ\nประมาท\nประมุข\nประมุท\nประมูล\nประเมิน\nประโมง\nประโมทย์\nประยงค์\nประยุกต์\nประยุทธ์\nประยุร\nประยูร\nประโยค\nประโยชน์\nประโรหิต\nประลมพ์\nประลอง\nประลัย\nประลาต\nประลาย\nประลุ\nประเล่ห์\nประเล้าประโลม\nประโลม\nประวรรต\nประวรรตน์\nประวัติ\nประวาล\nประวาลปัทม์\nประวาส\nประวิง\nประวิช\nประวิตร\nประวิน\nประวีณ\nประเวณี\nประเวศ\nประเวศน์\nประศม\nประศาสน์\nประศุ\nประสก\nประสงค์\nประสบ\nประสพ\nประสม\nประสะ\nประสัก\nประสันนาการ\nประสัยห์\nประสา\nประสาท\nประสาธน์\nประสาน\nประสาร\nประสิทธิ์\nประสิทธิผล\nประสิทธิภาพ\nประสีประสา\nประสูต\nประสูติ\nประเสบัน\nประเสบันอากง\nประเสริฐ\nประหนึ่ง\nประหม่า\nประหยัด\nประหลาด\nประหล่ำ\nประหวัด\nประหวั่น\nประหัตประหาร\nประหาณ\nประหาร\nประเหล\nประเหส\nประไหมสุหรี\nประอบ\nประอร\nปรัก\nปรักปรำ\nปรักมะ\nปรัง\nปรัชญา\nปรัตถจริยา\nปรัตยุบัน\nปรัน\nปรับ\nปรัมปรา\nปรัศจิม\nปรัศนา\nปรัศนี\nปรัศว์\nปรัสสบท\nปร่า\nปรากฏ\nปรากรม\nปรากฤต\nปราการ\nปราง\nปรางค์\nปราจีน\nปราชญ์\
nปราชญา\nปราชัย\nปราณ\nปราณี\nปราด\nปราน\nปรานี\nปราบ\nปราบดาภิเษก\nปราปต์\nปราม\nปรามาส\nปราโมช\nปราโมทย์\nปราย\nปรารถนา\nปรารภ\nปรารมภ์\nปราศ\nปราศจาก\nปราศรัย\nปราษณี\nปราษาณ\nปราสัย\nปราสาท\nปรำ\nปริ\nปริก\nปริกขาร\nปริกรรม\nปริกัป\nปริคณห์\nปริจาค\nปริจาริกา\nปริเฉท\nปริชน\nปริซึม\nปริญญา\nปริณาม\nปริณายก\nปริต\nปริตตะ\nปริตโตทก\nปริตร\nปริทรรศน์\nปริทัยหัคคี\nปริทัศน์\nปริเทพ\nปริเทวะ\nปรินิพพาน\nปริบ\nปริบท\nปริปันถ์\nปริพนธ์\nปริพัตร\nปริพันธ์\nปริพาชก\nปริภัณฑ์\nปริภาษ\nปริภูมิ\nปริโภค\nปริ่ม\nปริมณฑล\nปริมาณ\nปริมาตร\nปริยัติ\nปริยานุช\nปริยาย\nปริเยศ\nปริโยสาน\nปริวรรต\nปริวัตร\nปริวาร\nปริวาส\nปริวิตก\nปริเวณ\nปริศนา\nปริษัท\nปริสัญญู\nปริสุทธิ\nปริหาน\nปริหาร\nปรี่\nปรีชญา\nปรีชา\nปรี๊ด\nปรีดา\nปรีดิ\nปรีดิ์\nปรีดี\nปรีติ\nปรียะ\nปรียา\nปรึก\nปรึกษา\nปรึง\nปรือ\nปรื๋อ\nปรุ\nปรุง\nปรู\nปรู๋\nปรูด\nปรู๊ด\nปรู๊ดปร๊าด\nปรู๊ฟ\nปฤงคพ\nปฤจฉา\nปฤษฎางค์\nปฤษฐ\nปลก\nปลกเปลี้ย\nปลง\nปล่ง\nปลด\nปล้น\nปลวก\nปลอก\nปล่อง\nปล้อง\nปลอด\nปล้อน\nปลอบ\nปลอม\nปล่อย\nปละ\nปลัก\nปลั๊ก\nปลัง\nปลั่ง\nปลัด\nปลัดขิก\nปลา\nปลาต\nปลาบ\nปลาย\nปลาสเตอร์\nปลาสนาการ\nปล้ำ\nปลิง\nปลิด\nปลิ้น\nปลิโพธ\nปลิม\nปลิ่ม\nปลิว\nปลี\nปลีก\nปลื้ม\nปลุก\nปลูก\nปวกเปียก\nปวง\nป่วง\nปวด\nป่วน\nป้วน\nป้วนเปี้ยน\nป่วย\nปวัตน์\nปวารณา\nปวาล\nปวาส\nปวาฬ\nปวิช\nปวิตร\nปวิธ\nปวิเวก\nปวีณ\nปวุติ\nปเวณี\nปเวส\nปเวสน์\nปศุ\nปสันนะ\nปสันนาการ\nปสาท\nปสาน\nปสาสน์\nปสุ\nปสุต\nปสูติ\nปหังสนะ\nปหาน\nปหาร\nปหาส\nปอ\nป้อ\nป๋อ\nปอก\nปอง\nป่อง\nป้อง\nปอด\nปอน\nป้อน\nปอนด์\nปอเนาะ\nปอบ\nป้อแป้\nปอม\nป้อม\nป๋อม\nปอย\nป้อย\nป้อยอ\nปะ\nปะกน\nปะกัง\nปะการัง\nปะกำ\nปะขาว\nปะงาบ\nปะตาปา\nปะตาระกาหลา\nปะติดปะต่อ\nปะติยาน\nปะทะ\nปะทะปะทัง\nปะทุ\nปะทุน\nปะปน\nปะมง\nปะราลี\nปะรำ\nปะไร\nปะลอม\nปะเลง\nปะแล่ม\nปะโลง\nปะวะหล่ำ\nปะหงับ\nปะหนัน\nปะหัง\nปะเหลาะ\nปัก\nปักข์\nปักเป้า\nปักษ์\nปักษา\nปักษิน\nปักษี\nปัง\nปั๋ง\nปังสุ์\nปังสุกุล\nปัจจัตตะ\nปัจจันต์\nปัจจันตคาม\nปัจจันตชนบท\nปัจจันตประเทศ\nปัจจัย\nปัจจามิตร\nปัจจุคมน์\nปัจจุทธรณ์\nปัจจุบัน\nปัจจุสมัย\nปัจเจก\nปัจโจปการกิจ\nปัจฉา\nปัจฉิม\nปัจถรณ์\nปัจนึก\nปัจยาการ\nปัจเวกขณ์\nป
ัชชุน\nปัญจนที\nปัญจวัคคีย์\nปัญจก\nปัญจกะ\nปัญจมี\nปัญจวีสติ\nปัญญัติ\nปัญญา\nปัญหา\nปัฏ\nปัฏนะ\nปัฐยาวัต\nปัณฑรหัตถี\nปัณณะ\nปัณณาส\nปัณณาสก์\nปัณรสี\nปัณหิ\nปัด\nปัตคาด\nปัตตะ\nปัตตานึก\nปัตตานุโมทนา\nปัตตาเวีย\nปัตติ\nปัตติก\nปัตถร\nปัตถะ\nปัตนิ\nปัตนี\nปัตหล่า\nปัถพี\nปัถวี\nปัทม์\nปัทมะ\nปัทมาสน์\nปัน\nปั่น\nปั้น\nปั้นจั่น\nปันจุเหร็จ\nปั้นลม\nปั้นหยา\nปั้นเหน่ง\nปับ\nปั๊บ\nปัปผาสะ\nปัพพาชนียกรรม\nปัพภาระ\nปั๊ม\nปัยกา\nปัยยิกา\nปัวเปีย\nปัศจิม\nปัศตัน\nปัสสาวะ\nปัสสาสะ\nปา\nป่า\nป้า\nปาก\nปาง\nป้าง\nปาจรีย์\nปาจิตตีย์\nปาจีน\nปาฏลิ\nปาฏิบท\nปาฏิบุคลิก\nปาฏิโภค\nปาฏิหาริย์\nปาฐก\nปาฐกถา\nปาฐะ\nปาณกชาติ\nปาณะ\nปาณาติบาต\nปาณิ\nปาณี\nปาด\nปาติโมกข์\nปาตี\nปาเต๊ะ\nปาท่องโก๋\nปาทังกา\nปาทาน\nปาทุกา\nปาน\nป่าน\nป้าน\nปานะ\nปานียะ\nป้าบ\nป๊าบ\nปาพจน์\nปาม\nปาโมกข์\nป่าย\nป้าย\nปายาส\nปาร์เกต์\nปารมี\nปารเมศ\nปาราชิก\nปาริฉัตร\nปาริชาต\nปารุสกวัน\nปาล\nปาล์ม\nปาลิไลยก์\nปาลี\nปาว\nป่าว\nป๊าว\nปาวาร\nปาษาณ\nปาส\nปาสาณ\nปาสาทิกะ\nปาหนัน\nปาหี่\nปำ\nป้ำ\nป้ำเป๋อ\nปิกนิก\nปิ้ง\nปิงคละ\nปิงปอง\nปิฎก\nปิฏฐะ\nปิฐิ\nปิณฑะ\nปิด\nปิตตะ\nปิตา\nปิตามหัยกา\nปิตามหัยยิกา\nปิตุ\nปิตุจฉา\nปิตุภูมิ\nปิตุลา\nปิโตรเลียม\nปิ่น\nปิ่นแก้ว\nปิ่นโต\nปิปผลี\nปิ่ม\nปิ้ม\nปิยะ\nปิยังคุ\nปิโยรส\nปิลันธน์\nปิ๋ว\nปิศาจ\nปิหกะ\nปี\nปี่\nปี้\nปี๋\nปีก\nปีฐะ\nปี๊ด\nปีติ\nปีน\nปีบ\nปี๊บ\nปีศาจ\nปีฬกะ\nปึก\nปึง\nปึ่ง\nปึ๋ง\nปึ้ด\nปึมปื้อ\nปืน\nปื้น\nปือ\nปื้อ\nปื๋อ\nปุ\nปุ๊\nปุก\nปุกปุย\nปุคละ\nปุ้งกี๋\nปุงควะ\nปุงลิงค์\nปุงลึงค์\nปุจฉา\nปุฏะ\nปุณฑริก\nปุด\nปุตตะ\nปุถุชน\nปุนนาค\nปุนภพ\nปุนัพสุ\nปุบ\nปุ๊บ\nปุปผะ\nปุปะ\nปุพพะ\nปุ่ม\nปุ่มป่ำ\nปุ้ม\nปุ๋ม\nปุย\nปุ้ย\nปุ๋ย\nปุรณะ\nปุระ\nปุราณ\nปุราณะ\nปุริมพรรษา\nปุเรจาริก\nปุโรหิต\nปุลลิงค์\nปุลลึงค์\nปุลินท์\nปุโลปุเล\nปุษยะ\nปุสสะ\nปู\nปู่\nปูชกะ\nปูชนียบุคคล\nปูชนียวัตถุ\nปูชนียะ\nปูชา\nปูชิต\nปูด\nปูน\nปูม\nปู้ยี่ปู้ยำ\nปูระ\nปูลู\nเป้\nเป๋\nเปก\nเป๊ก\nเป่ง\nเป้ง\nเป๋ง\nเป็ด\nเปตพลี\nเปตอง\nเป็น\nเปยยาล\nเปร็ง\nเปรต\nเปรม\nเปรย\nเปรอ\nเปรอะ\nเปราะ\nเปรียง\nเปรี้ยง\nเปรี้ยงปร้าง\nเปรียญ\nเปรียบ\nเปรี่ยม\nเปรียว\nเปรี้ยว\nเปรียะ\nเปรี๊ยะ\nเปรื่อง\nเปรื้อ
ย\nเปล\nเปล่ง\nเปลว\nเปลา\nเปล่า\nเปล้า\nเปลาะ\nเปลี้ย\nเปลี่ยน\nเปลี่ยม\nเปลี่ยว\nเปลือก\nเปลือง\nเปลื้อง\nเปลือย\nเปศะ\nเปศัส\nเปสการ\nเปสละ\nเปสุญวาท\nเป๋อ\nเป้อเย้อ\nเปอร์\nเปอร์เซ็นต์\nเปะ\nเปา\nเป่า\nเป้า\nเป๊า\nเป๋า\nเป๋าฮื้อ\nเปาะ\nเป๊าะ\nเปาะเปี๊ยะ\nเปาะแปะ\nเปิก\nเปิง\nเปิงมาง\nเปิด\nเปิ่น\nเปิบ\nเปิ๊บ\nเปีย\nเปียก\nเปี๊ยก\nเปี๊ยบ\nเปี่ยม\nเปี้ยว\nเปี๊ยว\nเปียะ\nเปี๊ยะ\nเปือก\nเปื้อน\nเปื่อย\nแป\nแป้ง\nแป๋ง\nแปด\nแป๊ด\nแปทู\nแป้น\nแป๊น\nแปบ\nแป๊บ\nแปม\nแปร\nแปร๋\nแปรก\nแปรง\nแปร่ง\nแปร๊ด\nแปร้น\nแปร๋น\nแปรปรวน\nแประ\nแปล\nแปล้\nแปลก\nแปลง\nแปลน\nแปลบ\nแปล๊บ\nแป้ว\nแป๊ว\nแปะ\nแป๊ะ\nแป๊ะซะ\nโป\nโป่\nโป้\nโป๊\nโปก\nโป๊ก\nโป๊กเกอร์\nโปกขรณี\nโปกขรพรรษ\nโปเก\nโปง\nโป่ง\nโป้ง\nโป่งข่าม\nโปงลาง\nโป่งวิด\nโปฐบท\nโปฐปทมาส\nโปดก\nโปตถกะ\nโปน\nโป๊ป\nโป๊ยเซียน\nโปรแกรม\nโปรง\nโปร่ง\nโปรด\nโปรตอน\nโปรตีน\nโปรเตสแตนต์\nโปรแทรกเตอร์\nโปรย\nโปล่ง\nโปลิโอ\nโปโล\nโปสก\nโปสต์การ์ด\nโปะ\nโป๊ะ\nโป๊ะจ้าย\nไป\nไป่\nไป๋\nไปยาล\nไปรษณีย์\nไปรษณียบัตร\nไปรษณียภัณฑ์\nไปรษณียากร\nไปล่\nไปศาจ\nผก\nผกา\nผกาย\nผคม\nผง\nผงก\nผงม\nผงะ\nผงาด\nผง่าน\nผจง\nผจญ\nผจัญ\nผจาน\nผชุม\nผณิน\nผณินทร\nผณิศวร\nผด\nผดุง\nผเดิน\nผทม\nผนวก\nผนวช\nผนัง\nผนิด\nผนึก\nผม\nผยอง\nผรณาปีติ\nผรสุ\nผริต\nผรุสวาท\nผล\nผลคุน\nผลคุนี\nผล็อย\nผละ\nผลัก\nผลัด\nผลับ\nผลัวะ\nผลา\nผลาญ\nผลานิสงส์\nผลาผล\nผลาหาร\nผลิ\nผลิกะ\nผลิต\nผลิน\nผลี\nผลีผลาม\nผลึก\nผลึ่ง\nผลือ\nผลุ\nผลุง\nผลุด\nผลุน\nผลุนผลัน\nผลุบ\nผลุบผลับ\nผลุย\nผลู\nผวน\nผวย\nผวา\nผสม\nผสาน\nผอง\nผ่อง\nผ่อน\nผอบ\nผอม\nผ็อย\nผอูน\nผะ\nผะดา\nผะสา\nผัก\nผัคคุณ\nผัคคุณี\nผัง\nผัด\nผัน\nผับ\nผัว\nผัวะ\nผัสสะ\nผัสสาหาร\nผา\nผ่า\nผ้า\nผาก\nผาง\nผ่าง\nผาณิต\nผาด\nผาติ\nผ่าน\nผาม\nผาย\nผ่ายผอม\nผาล\nผาลคุน\nผาลา\nผ่าว\nผาสุก\nผ้าฮาด\nผำ\nผ้ำ\nผิ\nผิง\nผิด\nผิตะ\nผิน\nผิว\nผี\nผี้ว์\nผึง\nผึ่ง\nผึ้ง\nผึ้งรวง\nผืน\nผื่น\nผุ\nผุด\nผุยผง\nผุสราคา\nผู้\nผูก\nเผ\nเผง\nเผชิญ\nเผ็ด\nเผด็จ\nเผดิม\nเผดียง\nเผ่น\nเผนิก\nเผย\nเผยอ\nเผยิบ\nเผยิบผยาบ\nเผล\nเผล่\nเผล็ด\nเผลอ\nเผลอไผล\nเผละ\nเผลาะ\nเผลาะแผละ\nเผลียง\nเผอเรอ\nเผอิญ\nเผอิล\nเผะ\nเผา\nเผ่า\nเผ้า\nเผาะ\nเผิ้ง\nเผิน\nเผียน
\nเผือ\nเผื่อ\nเผือก\nเผือด\nเผือน\nเผื่อน\nแผ่\nแผก\nแผง\nแผด\nแผน\nแผ่น\nแผนก\nแผล\nแผลง\nแผล็บ\nแผล็ว\nแผละ\nแผ่ว\nแผ้ว\nโผ\nโผง\nโผฏฐัพพะ\nโผน\nโผเผ\nโผย\nโผล่\nโผลกเผลก\nโผละ\nโผอน\nโผะ\nไผ\nไผ่\nไผท\nฝน\nฝรั่ง\nฝรั่งเศส\nฝ่อ\nฝอย\nฝัก\nฝัง\nฝั่ง\nฝัด\nฝัน\nฝา\nฝ่า\nฝ้า\nฝาก\nฝาง\nฝาด\nฝาน\nฝาย\nฝ่าย\nฝ้าย\nฝิ่น\nฝี\nฝี่\nฝีก\nฝึก\nฝืด\nฝืน\nฝุ่น\nฝูง\nเฝ้า\nเฝือ\nเฝือก\nเฝือง\nเฝื่อน\nแฝก\nแฝง\nแฝด\nใฝ่\nไฝ\nพก\nพกุล\nพง\nพงศ์\nพงศกร\nพงศธร\nพงศา\nพงศาวดาร\nพจน์\nพจนา\nพจนานุกรม\nพจนารถ\nพจนีย์\nพจมาน\nพจี\nพชระ\nพญา\nพญาลอ\nพณิช\nพณิชย์\nพดด้วง\nพธู\nพนันดร\nพนาดร\nพนาดอน\nพนาราม\nพนาลัย\nพนาลี\nพนาวาส\nพนาเวศ\nพนาศรม\nพนาสณฑ์\nพนาสัณฑ์\nพเนจร\nพ่น\nพ้น\nพนม\nพนอง\nพนอม\nพนัก\nพนักงาน\nพนัง\nพนัน\nพนัส\nพนา\nพนาย\nพนิดา\nพนิต\nพบ\nพม่า\nพยนต์\nพยศ\nพยัก\nพยักพเยิด\nพยัคฆ์\nพยัคฆา\nพยัคฆิน\nพยัคฆี\nพยัชน์\nพยัญชนะ\nพยัต\nพยับ\nพยากรณ์\nพยาฆร์\nพยางค์\nพยาธิ\nพยาน\nพยาบาท\nพยาบาล\nพยาม\nพยามะ\nพยายาม\nพยุ\nพยุง\nพยุหยาตรา\nพยุหโยธา\nพยุหเสนา\nพยุหะ\nพยู่ห์\nพเยีย\nพร\nพรต\nพรม\nพรรค\nพรรค์\nพรรคานต์\nพรรณ\nพรรณนา\nพรรดึก\nพรรลาย\nพรรษ\nพรรษา\nพรรเอิญ\nพรวด\nพรวน\nพรหม\nพรหมจรรย์\nพรหมจาริณี\nพรหมจารี\nพรหมา\nพรหมาสตร์\nพรหมินทร์\nพรอก\nพร่อง\nพร้อง\nพรอด\nพร้อม\nพร้อมพรัก\nพร่อย\nพร้อย\nพระ\nพระนอม\nพระนาด\nพระฮาม\nพรักพร้อม\nพรั่ง\nพรัด\nพรั่น\nพรับ\nพร่า\nพร้า\nพราก\nพราง\nพร่าง\nพราด\nพราน\nพราย\nพราว\nพราหมณ์\nพราหมณะ\nพราหมณี\nพราหมี\nพรำ\nพร่ำ\nพริก\nพริ้ง\nพริบ\nพริ้ม\nพรึง\nพรึน\nพรึบ\nพรึ่บ\nพรืด\nพรุ\nพรุ่ง\nพรุน\nพรู\nพรูด\nพฤกษ์\nพฤกษชาติ\nพฤกษเทวดา\nพฤกษราช\nพฤกษศาสตร์\nพฤกษา\nพฤฒ\nพฤฒา\nพฤฒาจารย์\nพฤฒิ\nพฤต\nพฤติ\nพฤทธ์\nพฤทธิ์\nพฤนต์\nพฤนท์\nพฤศจิก\nพฤศจิกายน\nพฤษภ\nพฤษภาคม\nพฤหัสบดี\nพล\nพละ\nพลากร\nพลาดิศัย\nพลาธิการ\nพลานามัย\nพลบ\nพลวก\nพลวง\nพลวัต\nพลศาสตร์\nพลอ\nพล้อ\nพลอง\nพลอด\nพลอน\nพลอมแพลม\nพลอย\nพล่อย\nพลั่ก\nพลัง\nพลั่ง\nพลั้ง\nพลัด\nพลัน\nพลับ\nพลับพลา\nพลับพลึง\nพลั่ว\nพล่า\nพลาง\nพลาญ\nพลาด\nพล่าน\nพลาม\nพล่าม\nพลาย\nพลาสติก\nพลาสมา\nพลาหก\nพลำ\nพล้ำ\nพลำภัง\nพลิก\nพลิพัท\nพลิ้ว\nพลี\nพลุ\nพลุก\nพลุ่ก\nพลุกพล่าน\nพลุ่ง\nพลุ้ย\nพลู
\nพลูโต\nพลูโทเนียม\nพวก\nพวง\nพ่วง\nพวน\nพวย\nพสก\nพสุ\nพสุธา\nพสุสงกรานต์\nพหล\nพหุ\nพหุล\nพหู\nพอ\nพ่อ\nพ้อ\nพอก\nพอง\nพ้อง\nพอน\nพ้อม\nพอโลเนียม\nพะ\nพะงา\nพะงาบ\nพะจง\nพะทำมะรง\nพะนอ\nพะเน้าพะนอ\nพะเนิน\nพะเนียง\nพะแนง\nพะพาน\nพะพิง\nพะเพิง\nพะยอม\nพ่ะย่ะค่ะ\nพะยูง\nพะยูน\nพะเยิบ\nพะเยิบพะยาบ\nพะรุงพะรัง\nพะโล้\nพะไล\nพะวง\nพะวักพะวน\nพะวา\nพะว้าพะวัง\nพะอง\nพะอากพะอำ\nพะอืดพะอม\nพัก\nพักตร์\nพักตรา\nพักร\nพัง\nพังกา\nพังงา\nพังผืด\nพังพวย\nพังพอน\nพังพาน\nพังพาบ\nพังเพย\nพัช\nพัชนี\nพัชระ\nพัญจน์\nพัฒนะ\nพัฒนา\nพัฒนากร\nพัฒนาการ\nพัด\nพัดชา\nพัดดึงส์\nพัตร\nพัทธ์\nพัทธยา\nพัทธสีมา\nพัทร\nพัน\nพันตู\nพันทาง\nพันธ์\nพันธะ\nพันธกรณี\nพันธบัตร\nพันธมิตร\nพันธนะ\nพันธนาคาร\nพันธนาการ\nพันธุ์\nพันธุกรรม\nพันลึก\nพันลือ\nพันเลิศ\nพันเอิญ\nพับ\nพัลลภ\nพัลวัน\nพัว\nพัวะ\nพัศดี\nพัสดุ\nพัสตร์\nพัสถาน\nพา\nพาก\nพากเพียร\nพากย์\nพาง\nพ่าง\nพาชี\nพาณิช\nพาณิชย์\nพาณิชยกรรม\nพาณิชยการ\nพาณิชยศาสตร์\nพาณิชยศิลป์\nพาณินี\nพาณี\nพาณีนี\nพาด\nพาท\nพาทย์\nพาธ\nพาธา\nพาน\nพ่าน\nพานร\nพานรินทร์\nพาม\nพาย\nพ่าย\nพายม้า\nพายัพ\nพายุ\nพาร์เซก\nพารณ\nพารา\nพาราฟิน\nพาล\nพาลา\nพาลี\nพาลุก\nพาโล\nพาไล\nพาส\nพาสน์\nพาสนา\nพาสุกรี\nพ่าห์\nพาหนะ\nพาหะ\nพาหา\nพาหิรกะ\nพาหิระ\nพาหุ\nพาหุรัด\nพาหุสัจจะ\nพาเหียร\nพาฬ\nพำ\nพำนัก\nพำพึม\nพำลา\nพิกล\nพิกสิต\nพิกัด\nพิกัติ\nพิกัน\nพิการ\nพิกุล\nพิเคราะห์\nพิฆน์\nพิฆเนศ\nพิฆเนศวร\nพิฆาต\nพิง\nพิจัย\nพิจาร\nพิจารณ์\nพิจารณา\nพิจิก\nพิจิต\nพิจิตร\nพิชญ์\nพิชัย\nพิชาน\nพิชิต\nพิเชฐ\nพิเชียร\nพิฑูรย์\nพิณ\nพิดทูล\nพิดาน\nพิโดร\nพิตร\nพิถี\nพิถีพิถัน\nพิทย\nพิทย์\nพิทยา\nพิทยาคม\nพิทยาคาร\nพิทยาธร\nพิทยาลัย\nพิทักษ์\nพิทูร\nพิเทศ\nพิธาน\nพิธี\nพิธุ\nพินทุ\nพินอบพิเทา\nพินัย\nพินาศ\nพินิจ\nพินิต\nพินิศ\nพินิศจัย\nพิเนต\nพิบัติ\nพิบุล\nพิบูล\nพิปริต\nพิปลาส\nพิพรรธ\nพิพรรธน์\nพิพักพิพ่วน\nพิพัฒ\nพิพัฒน์\nพิพากษ์\nพิพากษา\nพิพาท\nพิพิธ\nพิพิธภัณฑ์\nพิพิธภัณฑสถาน\nพิภพ\nพิภัช\nพิภาค\nพิภูษณะ\nพิเภก\nพิมปะการัง\nพิมพ์\nพิมพการัง\nพิมพา\nพิมพาภรณ์\nพิมล\nพิมเสน\nพิมาน\nพิมุข\nพิโมกข์\nพิโมกษ์\nพิโยกพิเกน\nพิโยค\nพิรอด\nพิระ\nพิรากล\nพิราบ\nพิราม\nพิราลัย\nพิร
ิยะ\nพิรี้พิไร\nพิรุณ\nพิรุธ\nพิรุฬห์\nพิเรนทร์\nพิเราะ\nพิโรธ\nพิไร\nพิลังกาสา\nพิลาป\nพิลาส\nพิลิปดา\nพิลึก\nพิลึกกึกกือ\nพิลึกพิลั่น\nพิโลน\nพิไล\nพิศ\nพิศวง\nพิศวาส\nพิศาล\nพิศุทธ์\nพิศุทธิ์\nพิเศษ\nพิษ\nพิษฐาน\nพิษนาศน์\nพิสดาร\nพิสมร\nพิสมัย\nพิสัง\nพิสัช\nพิสัย\nพิสิฐ\nพิสุทธิ์\nพิสูจน์\nพิหค\nพิหาร\nพิฬาร\nพี\nพี่\nพี้\nพีชคณิต\nพีระมิด\nพึง\nพึ่ง\nพึ่บ\nพึ่บพั่บ\nพึม\nพึมพำ\nพืช\nพืด\nพื้น\nพุ\nพุก\nพุกาม\nพุง\nพุ่ง\nพุงดอ\nพุฒ\nพุฒิ\nพุด\nพุดตาน\nพุทธ\nพุทธะ\nพุทธังกูร\nพุทธางกูร\nพุทธันดร\nพุทธาภิเษก\nพุทธาวาส\nพุทธิ\nพุทโธ่\nพุทรา\nพุธ\nพุ่ม\nพุมเรียง\nพุ้ย\nพู\nพูพอน\nพู่\nพูด\nพูน\nพู้น\nพู่ระหง\nเพ\nเพ็ก\nเพกา\nเพคะ\nเพ็ง\nเพ่ง\nเพ็จ\nเพชฉลูกรรม\nเพชฌฆาต\nเพชร\nเพชรดา\nเพชรปาณี\nเพชรฤกษ์\nเพชรายุธ\nเพชรกลับ\nเพชรสังฆาต\nเพชรหลีก\nเพชรหึง\nเพ็ญ\nเพฑูริย์\nเพณี\nเพ็ดทูล\nเพดาน\nเพท\nเพทนา\nเพทาย\nเพทุบาย\nเพโทบาย\nเพ่นพ่าน\nเพนียด\nเพไนย\nเพ้ย\nเพรง\nเพรซีโอดิเมียม\nเพรา\nเพราะ\nเพริด\nเพริศ\nเพรียก\nเพรียง\nเพรียบ\nเพรี้ยม\nเพรียว\nเพรื่อ\nเพรือง\nเพล\nเพลง\nเพล็ด\nเพล้โพล้\nเพลา\nเพลาะ\nเพลิง\nเพลิดเพลิน\nเพลิน\nเพลีย\nเพลี้ย\nเพลี่ยง\nเพศ\nเพส\nเพสลาด\nเพ่อ\nเพ้อ\nเพ้อเจ้อ\nเพอิญ\nเพะ\nเพา\nเพาะ\nเพิก\nเพิง\nเพิ่ง\nเพิดเพ้ย\nเพิ่ม\nเพี้ย\nเพียง\nเพี้ยง\nเพียงออ\nเพี้ยน\nเพียบ\nเพียร\nเพื่อ\nเพื่อน\nแพ\nแพ้\nแพง\nแพ่ง\nแพงพวย\nแพทย์\nแพทยศาสตร์\nแพน\nแพ่น\nแพนก\nแพนงเชิง\nแพร\nแพร่\nแพรก\nแพร่ง\nแพรว\nแพร้ว\nแพลง\nแพลทินัม\nแพล็บ\nแพลม\nแพลเลเดียม\nแพละ\nแพละโลม\nแพว\nแพ้ว\nแพศย์\nแพศยา\nแพะ\nโพ\nโพก\nโพกพาย\nโพง\nโพงพาง\nโพชฌงค์\nโพซิตรอน\nโพด\nโพทะเล\nโพแทสเซียม\nโพธ\nโพธิ\nโพธิ์\nโพน\nโพ้น\nโพนทะนา\nโพบาย\nโพย\nโพยก๊วน\nโพยม\nโพรก\nโพรง\nโพรโทแอกทิเนียม\nโพรมีเทียม\nโพระดก\nโพล่\nโพลง\nโพล่ง\nโพล้ง\nโพลน\nโพล้เพล้\nโพละ\nโพสพ\nไพ\nไพ่\nไพจิตร\nไพชน\nไพชยนต์\nไพฑูรย์\nไพที\nไพบูลย์\nไพพรรณ\nไพร\nไพร่\nไพรจิตร\nไพรชน\nไพรชยนต์\nไพรฑูรย์\nไพรที\nไพรบูลย์\nไพรเราะ\nไพรัช\nไพรำ\nไพริน\nไพรินทร์\nไพรี\nไพเราะ\nไพโรจน์\nไพล\nไพล่\nไพศาขะ\nไพศาล\nไพเศษ\nไพสพ\nไพสิฐ\nไพหาร\nฟก\nฟ้ง\nฟรักโทส\nฟรี\nฟลูออรีน\nฟ่อ\nฟ้อ\nฟอก\nฟอง\nฟ่อง\nฟ้อง\nฟอด\nฟอน\nฟ่อน\nฟ้อ
น\nฟ้อแฟ้\nฟอร์มาลดีไฮด์\nฟอร์มาลิน\nฟอสฟอรัส\nฟอสเฟต\nฟัก\nฟักฟุ้น\nฟัง\nฟังก์ชัน\nฟัด\nฟัน\nฟั่น\nฟั้น\nฟ้า\nฟาก\nฟาง\nฟ่าง\nฟาด\nฟาทอม\nฟาน\nฟ่าม\nฟาย\nฟาร์ม\nฟาสซิสต์\nฟิด\nฟิต\nฟิบ\nฟิล์ม\nฟิวส์\nฟิสิกส์\nฟี่\nฟี้\nฟืดฟาด\nฟืน\nฟื้น\nฟืม\nฟุ\nฟุ้ง\nฟุต\nฟุน\nฟุบ\nฟุ่บ\nฟุ่มเฟือย\nฟุลสแก๊ป\nฟู\nฟู่\nฟูก\nฟูด\nฟูม\nเฟ็ด\nเฟ้น\nเฟลด์สปาร์\nเฟ้อ\nเฟอร์เมียม\nเฟอะ\nเฟอะฟะ\nเฟะ\nเฟะฟะ\nเฟิน\nเฟี้ยม\nเฟี้ยว\nเฟือ\nเฟื้อ\nเฟือง\nเฟื่อง\nเฟื้อง\nเฟือน\nเฟือย\nเฟื้อย\nแฟ่\nแฟง\nแฟชั่น\nแฟน\nแฟบ\nแฟ้ม\nแฟรนเซียม\nแฟลกซ์\nแฟลต\nแฟะ\nโฟกัส\nไฟ\nภควดี\nภควัต\nภควันต์\nภควัม\nภควา\nภควาน\nภคะ\nภคันทลา\nภคินี\nภณะ\nภณิดา\nภพ\nภมร\nภมริน\nภมรี\nภมุกา\nภยันตราย\nภยาคติ\nภระ\nภรณี\nภรต\nภรรดร\nภรรดา\nภรรยา\nภระมร\nภระมรี\nภราดร\nภราดรภาพ\nภราดา\nภริยา\nภฤศ\nภวะ\nภวตัณหา\nภวนะ\nภวังค์\nภวังคจิต\nภักดี\nภักตะ\nภักติ\nภักษ์\nภักษา\nภักษาหาร\nภัค\nภัคน์\nภังคะ\nภังคี\nภัจ\nภัณฑ์\nภัณฑาคาร\nภัณฑาคาริก\nภัณฑารักษ์\nภัณฑนะ\nภัณฑู\nภัต\nภัตตาคาร\nภัตตาหาร\nภัตร\nภัทระ\nภัทรกัป\nภัพ\nภัย\nภัสดา\nภัสตรา\nภัสมะ\nภัสสร\nภา\nภาค\nภาคย์\nภาคยานุวัติ\nภาคินี\nภาคิไนย\nภาคี\nภาคียะ\nภาชนะ\nภาชี\nภาณ\nภาณวาร\nภาณกะ\nภาณี\nภาณุ\nภาดร\nภาดา\nภาตระ\nภาตา\nภาตุ\nภาติกะ\nภาติยะ\nภาพ\nภาพย์\nภาม\nภาย\nภาร\nภาระ\nภารดี\nภารต\nภารตี\nภารยทรัพย์\nภารยา\nภารา\nภาวนา\nภาวะ\nภาษ\nภาษณ์\nภาษา\nภาษิต\nภาษี\nภาส\nภาสน์\nภาสวร\nภาสา\nภาสุระ\nภิกขา\nภิกขาจาร\nภิกขุ\nภิกขุนี\nภิกษา\nภิกษาจาร\nภิกษาหาร\nภิกษุ\nภิกษุณี\nภิงคาร\nภิญโญ\nภิตติ\nภินท์\nภินทนาการ\nภิยโย\nภิรมย์\nภิรมย์สุรางค์\nภิษัช\nภิสัก\nภีตะ\nภีมะ\nภีรุ\nภุกต์\nภุขัน\nภุช\nภุชงค์\nภุต\nภุมมะ\nภุมรัตน์\nภุมวาร\nภุมรา\nภุมริน\nภุมรี\nภุมเรศ\nภู\nภู่\nภูต\nภูติ\nภูม\nภูมิ\nภูมี\nภูริ\nภูรี\nภูวดล\nภูวนาถ\nภูวเนตร\nภูวไนย\nภูษา\nภูษิต\nเภกะ\nเภตรา\nเภท\nเภทุบาย\nเภรี\nเภสัช\nโภค\nโภคะ\nโภคิน\nโภคี\nโภไคย\nโภไคศวรรย์\nโภช\nโภชย์\nโภชก\nโภชนะ\nโภชนา\nโภชนาหาร\nโภชนียะ\nไภริน\nไภรี\nไภษัชคุรุ\nไภษัชย์\nมกร\nมกราคม\nมกุฎ\nมคธ\nมฆวัน\nมฆะ\nมฆา\nม่ง\nมงกุฎ\nมงโกรย\nมงคล\nมงคลวาร\nมณฑ์\nมณฑก\nมณฑนะ\nมณฑป\nมณฑล\nมณฑา\nมณฑารพ\nมณฑิระ\nมณเฑียร\nมณี\nมด\nมตะ\nมตกภั
ต\nมติ\nมทนะ\nมทะ\nมธุ\nมธุกร\nมธุการี\nมธุลีห์\nมธุระ\nมธุรพจน์\nมน\nมนินทรีย์\nม่น\nมนต์\nมนตร์\nมนตรี\nมนท์\nมนทิราลัย\nมนเทียร\nมนสิการ\nมนัส\nมนัสวี\nมนินทรีย์\nมนิมนา\nมนิลา\nมนุญ\nมนุษย์\nมนุษยชาติ\nมนุษยธรรม\nมนุษย์มนา\nมนุษยโลก\nมนุษยศาสตร์\nมนุษยสัมพันธ์\nมนุสาร\nมนู\nมนูสาร\nมโน\nมโนช\nมโนชญ์\nมโนราห์\nมโนสาเร่\nมโนห์รา\nมมังการ\nมยุรฉัตร\nมยุระ\nมยุรา\nมยุรี\nมยุเรศ\nมยูร\nมรกต\nมรคา\nมรฑป\nมรณ์\nมรณะ\nมรณกรรม\nมรณบัตร\nมรณภัย\nมรณภาพ\nมรดก\nมรรค\nมรรคา\nมรรตัย\nมรรยาท\nมรรษ\nมรสุม\nมริจ\nมริยาท\nมรีจิ\nมรุต\nมฤค\nมฤคย์\nมฤคศิระ\nมฤคศิรมาส\nมฤคเศียร\nมฤคินทร์\nมฤเคนทร์\nมฤดก\nมฤต\nมฤตยู\nมฤทุ\nมล\nมละ\nมลัก\nมลังเมลือง\nมล้าง\nมลาย\nมลายู\nมวก\nม่วง\nมวน\nม่วน\nม้วน\nม้วนต้วน\nมวย\nม้วย\nมวล\nมหกรรม\nมหรณพ\nมหรรณพ\nมหรสพ\nมหัจฉริยะ\nมหัต\nมหัทธนะ\nมหันต์\nมหันตโทษ\nมหัพภาค\nมหัศจรรย์\nมหา\nมหากฐิน\nมหากาฬ\nมหาขันธกะ\nมหาจักร\nมหาชน\nมหาชัย\nมหาชาติ\nมหาโชตรัต\nมหาดไทย\nมหาดเล็ก\nมหาตมะ\nมหาไถ่\nมหาเทพ\nมหาเทพี\nมหาเทวี\nมหาธาตุ\nมหานิกาย\nมหานิล\nมหาบพิตร\nมหาบัณฑิต\nมหาพน\nมหาพรหม\nมหาภารตะ\nมหาภิเนษกรมณ์\nมหาภูต\nมหาเมฆ\nมหายาน\nมหายุค\nมหาราช\nมหาฤกษ์\nมหาละลวย\nมหาละลาย\nมหาวงศ์\nมหาวรรค\nมหาวิทยาลัย\nมหาศักราช\nมหาศาล\nมหาสงกรานต์\nมหาสดมภ์\nมหาสดำ\nมหาสมุทร\nมหาสาวก\nมหาหงส์\nมหาหิงคุ์\nมหาอำนาจ\nมหาอุจ\nมหาอุด\nมหาอุปรากร\nมหาอุปราช\nมหิ\nมหิดล\nมหิธร\nมหิป\nมหิงส์\nมหิทธิ\nมหินท์\nมหิมา\nมหิศร\nมหิศวร\nมหิษ\nมหิษี\nมหึมา\nมเหยงค์\nมเหศ\nมเหศวร\nมเหศักดิ์\nมเหสักข์\nมเหสิ\nมเหสี\nมเหาฬาร\nมโหรสพ\nมโหระทึก\nมโหรี\nมโหฬาร\nมไหศวรรย์\nมอ\nมอง\nมองโกลอยด์\nมองโกเลีย\nมองคร่อ\nมอญ\nมอด\nม่อต้อ\nมอเตอร์\nมอเตอร์ไซค์\nมอน\nม่อน\nมอบ\nมอม\nมอมแมม\nม่อย\nมอร์ฟีน\nมอระกู่\nมอลโทส\nม่อลอกม่อแลก\nม่อห้อม\nม่อฮ่อม\nมะ\nมะกรูด\nมะกล่ำ\nมะกอก\nมะก่อง\nมะกะโรนี\nมะกา\nมะเกลือ\nมะเกี๋ยง\nมะข่วง\nมะขวิด\nมะขาม\nมะเขือ\nมะแข่น\nมะคังแดง\nมะค่า\nมะคำไก่\nมะคำดีควาย\nมะงั่ว\nมะงุมมะงาหรา\nมะซัก\nมะซาง\nมะดัน\nมะดีหวี\nมะดูก\nมะเดหวี\nมะเดื่อ\nมะต้อง\nมะตะบะ\nมะตาด\nมะตาหะรี\nมะตึ่ง\nมะตื๋น\nมะตูม\nมะแตก\nมะโต\nมะนาว\nมะปราง\nมะปริง\nมะฝ่อ\nมะพร้าว\nมะพลับ\nมะพูด\nมะแพ
น\nมะแพร้ว\nมะเฟือง\nมะแฟน\nมะไฟ\nมะม่วง\nมะม่าว\nมะมี่\nมะมื่น\nมะมุด\nมะเมอ\nมะเมีย\nมะเมื่อย\nมะแม\nมะยง\nมะยม\nมะระ\nมะริด\nมะรืน\nมะรุม\nมะรุมมะตุ้ม\nมะเร็ง\nมะเรื่อง\nมะโรง\nมะลอกมะแลก\nมะละกอ\nมะลิ\nมะลื่น\nมะลืมดำ\nมะลุลี\nมะแว้ง\nมะสัง\nมะเส็ง\nมะหวด\nมะหะหมัด\nมะหาด\nมะหิ่ง\nมะเหงก\nมะอึก\nมะฮอกกานี\nมัก\nมักกะโรนี\nมักกะลีผล\nมักกะสัน\nมักขะ\nมั่กขั้ก\nมักขิกา\nมัค\nมัคคะ\nมัคคุเทศก์\nมัคนายก\nมัฆวาน\nมั่ง\nมังกง\nมังกร\nมังกุ\nมังคละ\nมังค่า\nมังคุด\nมังตาน\nมังสวิรัติ\nมังสะ\nมังสี\nมัจจะ\nมัจจุ\nมัจฉริยะ\nมัจฉรี\nมัจฉะ\nมัจฉา\nมัชชะ\nมัชวิรัติ\nมัชชาระ\nมัชฌันติกสมัย\nมัชฌิม\nมัชฌิมา\nมัญจา\nมัญชิษฐา\nมัญชุ\nมัญชุสา\nมัญชูสา\nมัญเชฏฐะ\nมัฏฐะ\nมัณฑนศิลป์\nมัณฑนา\nมัด\nมัตตะ\nมัตตัญญู\nมัตตา\nมัตติกา\nมัตถกะ\nมัตถลุงค์\nมัตสยะ\nมัตสยา\nมัตสระ\nมัตสริน\nมัททวะ\nมัทนะ\nมัทยะ\nมัธยฐาน\nมัธยม\nมัธยันห์\nมัธยัสถ์\nมัน\nมั่น\nมันตา\nมันถะ\nมันทิระ\nมันทิราลัย\nมับ\nมั้ม\nมัมมี่\nมัย\nมัลละ\nมัลลิกา\nมัว\nมัวซัว\nมั่ว\nมัศยา\nมัสตุ\nมัสตาร์ด\nมัสมั่น\nมัสยิด\nมัสรู่\nมัสลิน\nมัสสุ\nมา\nม้า\nมาก\nมาคสิระ\nมาฆบูชา\nมาฆะ\nม้าง\nมางสะ\nมาณพ\nมาณวิกา\nมาด\nมาดา\nมาตงค์\nมาตร\nมาตรา\nมาตฤ\nมาตังคะ\nมาตา\nมาตามหัยกะ\nมาตามหัยกา\nมาตามหัยยิกา\nมาติกะ\nมาติกา\nมาตุ\nมาตุจฉา\nมาตุรงค์\nมาตุเรศ\nมาตุละ\nมาตุลา\nมาตุลานี\nมาทะ\nมาธยมิก\nมาธยมิกะ\nมาธุระ\nมาธุสร\nมาธูระ\nมาน\nม่าน\nม้าน\nมานพ\nมานะ\nมานัต\nมานัส\nมานิต\nมานี\nมานุษ\nมานุษยวิทยา\nมาโนชญ์\nมาบ\nมาภา\nม้าม\nม่าย\nมายา\nมาร\nมาราธิราช\nมารค\nมารดร\nมารดา\nมารยา\nมารยาท\nมารศรี\nมารษา\nมาริต\nมารุต\nมาลย์\nมาลัย\nมาลา\nมาลาตี\nมาลาเรีย\nมาลินี\nมาลี\nมาลุต\nมาศ\nมาส\nมาสก\nมาห์\nม่าห์\nมาหิส\nม่าเหมี่ยว\nมาฬก\nมิ\nมิค\nมิคสัญญี\nมิ่ง\nมิจฉา\nมิด\nมิตร\nมิติ\nมิเตอร์\nมิถยา\nมิถุน\nมิถุนายน\nมิทธะ\nมินตรา\nมินตา\nมินหม้อ\nมิ่ม\nมิ้ม\nมิไย\nมิรันตี\nมิลลิกรัม\nมิลลิบาร์\nมิลลิเมตร\nมิลลิลิตร\nมิลักขะ\nมิลักขู\nมิส\nมิสกรี\nมิสกวัน\nมิสซา\nมี\nมี่\nมีด\nมีเทน\nมีน\nมีนาคม\nมี่สั้ว\nมึง\nมึน\nมืด\nมืน\nมื่น\nมือ\nมื้อ\nมุ\nมุก\nมุกดา\nมุกดาหาร\nมุกุระ\nมุข\nมุขเด็จ\nมุขยประโยค\nมุโขโลกนะ\nมุคคะ\nมุง\nมุ่ง\nมุ้ง\nมุจ
ฉา\nมุจนะ\nมุจลินท์\nมุญจนะ\nมุญชะ\nมุฐิ\nมุณฑกะ\nมุณฑะ\nมุด\nมุตกิด\nมุตฆาต\nมุตตะ\nมุตตา\nมุตติ\nมุตะ\nมุติ\nมุททา\nมุทธชะ\nมุทธา\nมุทธาภิเษก\nมุทรา\nมุทริกา\nมุทะลุ\nมุทา\nมุทิกา\nมุทิงค์\nมุทิตา\nมุทุ\nมุทุตา\nมุ่น\nมุนิ\nมุนินทร์\nมุนี\nมุบ\nมุบมิบ\nมุม\nมุ้ม\nมุ่ย\nมุรธา\nมุรธาภิเษก\nมุสละ\nมุสลิม\nมุสา\nมุสิก\nมุหงิด\nมุหน่าย\nมุหุต\nมุฮัมมัด\nมูก\nมูเซอ\nมูตร\nมู่ทู่\nมูน\nมูมมาม\nมูรติ\nมูรธา\nมูรธาภิเษก\nมูล\nมูละ\nมูลา\nมูลิกากร\nมู่ลี่\nมู่เล่\nมูสัง\nมูสิก\nมูสิกะ\nมูสิกทันต์\nเม\nเม็ก\nเมกะเฮิรตซ์\nเมขลา\nเมฆ\nเมฆา\nเมฆินทร์\nเมฆี\nเม็ง\nเม็ด\nเมตตา\nเมตไตรย\nเมตร\nเมตริก\nเมตริกตัน\nเมถุน\nเมท\nเมโท\nเมทนี\nเมทินี\nเมทนีดล\nเมทานอล\nเมทิลแอลกอฮอล์\nเมธ\nเมธา\nเมธาวี\nเมธี\nเมน\nเม่น\nเม้น\nเมนเดลีเวียม\nเมนทอล\nเม้ม\nเมรัย\nเมริเดียน\nเมรุ\nเมล์\nเมล็ด\nเมลือง\nเมษ\nเมษายน\nเมห์\nเมหนะ\nเมหะ\nเมะ\nเมา\nเม่า\nเม้า\nเมารี\nเมาลี\nเมาฬี\nเมาะ\nเมิง\nเมิน\nเมิล\nเมีย\nเมียง\nเมี่ยง\nเมี้ยน\nเมือ\nเมื้อ\nเมื่อ\nเมือก\nเมือง\nเมือบ\nเมื่อย\nแม่\nแม้\nแมก\nแมกนีเซียม\nแมง\nแมงกะพรุน\nแมงกานิน\nแมงกานีส\nแมงคา\nแมงคาเรือง\nแมงช้าง\nแมงดา\nแมงลัก\nแม่ตะงาว\nแมน\nแม่น\nแม้น\nแมลง\nแมลบ\nแมว\nแม้ว\nแมะ\nโม\nโม่\nโม้\nโมก\nโมกข์\nโมกษะ\nโมฆกรรม\nโมฆสัญญา\nโมฆะ\nโมฆียกรรม\nโมฆียะ\nโมง\nโม่ง\nโมงครุ่ม\nโมทนา\nโมโนแซ็กคาไรด์\nโมไนย\nโมเม\nโมเมนต์\nโมเย\nโมรา\nโมรี\nโมเรส\nโมลิบดีนัม\nโมลี\nโมเลกุล\nโมเสก\nโมเสส\nโม่ห์\nโมหะ\nโมหันธ์\nโมหาคติ\nโมโห\nไม่\nไม้\nไมกา\nไมครอน\nไมโครกรัม\nไมโครฟิล์ม\nไมโครโฟน\nไมโครมิเตอร์\nไมโครเมตร\nไมโครลิตร\nไมโครเวฟ\nไมตรี\nไมยราบ\nไมล์\nยก\nยกกระบัตร\nยกนะ\nยง\nยงโย่\nยชุรเวท\nยติ\nยติภังค์\nยถากรรม\nยถาภูตญาณ\nย่น\nยนต์\nยนตร์\nยม\nยมก\nยมโดย\nยมนา\nยมล\nยมะ\nยรรยง\nยล\nยวง\nยวด\nยวน\nยวบ\nย้วย\nยวรยาตร\nยศ\nยโส\nยอ\nย่อ\nยอก\nย็อกแย็ก\nยอง\nย่อง\nย้อง\nยอด\nยอน\nย้อน\nยอบ\nยอม\nย่อม\nย้อม\nย่อย\nย้อย\nย้อแย้\nยะ\nย่ะ\nยะยอบ\nยะยับ\nยัก\nยักข์\nยักขินี\nยักษ์\nยักษา\nยักษิณี\nยักษี\nยัง\nยั้ง\nยั่งยืน\nยัชโญปวีต\nยัญ\nยัญญะ\nยัด\nยัติภังค์\nยัน\nยั่น\nยันต์\nยันตร\nยันตร์\nยันตรกรรม\nยั่นตะนี\nยับ\nยั่ว\nยั้ว\nยั้วเยี้ย\nยัวรยาตร\n
ยัวะ\nยัษฏิ\nยา\nย่า\nยาก\nยาคุ\nยาคู\nยาง\nย่าง\nยางพารา\nยาจก\nยาจนา\nยาไฉน\nยาด\nยาดา\nยาตร\nยาตรา\nยาน\nย่าน\nย่านพาโหม\nยานมาศ\nยานุมาศ\nยานี\nยาม\nย่าม\nยามะ\nยามักการ\nยามา\nยาย\nย้าย\nยายี\nยาว\nย้าว\nยาวกาลิก\nยาวชีวิก\nยาสูบ\nย่าหยา\nยาหยี\nยำ\nย่ำ\nย้ำ\nยำเยีย\nยิก\nยิง\nยิ่ง\nยิฏฐะ\nยิน\nยิบ\nยิบหยี\nยิปซัม\nยิปซี\nยิ้ม\nยิมนาสติก\nยิหวา\nยี\nยี่\nยี้\nยี่ก่า\nยี่เก\nยี่เข่ง\nยี่โถ\nยีน\nยี่โป้\nยี่ภู่\nยีราฟ\nยี่สก\nยี่สง\nยี่สน\nยี่สาน\nยี่สุ่น\nยี่หระ\nยี่หร่า\nยี่ห้อ\nยี่หุบ\nยึกยัก\nยึกยือ\nยึด\nยืด\nยืน\nยื่น\nยืม\nยื้อ\nยุ\nยุกกระบัตร\nยุกดิ\nยุกติ\nยุกติธรรม\nยุกต์\nยุค\nยุคนธร\nยุคล\nยุคันต์\nยุคันธร\nยุคุนธร\nยุง\nยุ่ง\nยุ้ง\nยุด\nยุต\nยุติ\nยุทธ\nยุทธ์\nยุทธนา\nยุทโธปกรณ์\nยุบ\nยุ่บ\nยุ่บยั่บ\nยุบล\nยุพดี\nยุพเรศ\nยุพา\nยุพาน\nยุพาพาล\nยุพาพิน\nยุ่มย่าม\nยุ่ย\nยุ้ย\nยุรยาตร\nยูรยาตร\nยุวชน\nยุวดี\nยุวราช\nยุวา\nยุวาน\nยู\nยู่\nยูง\nยูโด\nยูถะ\nยูถิกา\nยูริก\nยูเรนัส\nยูเรเนียม\nยูโรเพียม\nเย\nเย้\nเยง\nเยซู\nเย็ด\nเย็น\nเย็นตาโฟ\nเย็นเตาโฟ\nเย็บ\nเย้ย\nเยอ\nเย่อ\nเยอรมัน\nเยอว\nเย่อหยิ่ง\nเยอะ\nเยอะแยะ\nเยา\nเย้า\nเยาว์\nเยาวชน\nเยาวมาลย์\nเยาวยอด\nเยาวราช\nเยาวเรศ\nเยาวลักษณ์\nเยาวพา\nเยาวพาณี\nเยาวพาน\nเยาะ\nเยิง\nเยิน\nเยิ่น\nเยิ่นเย้อ\nเยินยอ\nเยิบ\nเยิบยาบ\nเยิ้ม\nเยีย\nเยี่ยง\nเยี่ยงอย่าง\nเยียงผา\nเยียดยัด\nเยียน\nเยียบ\nเยี่ยม\nเยียรบับ\nเยียรยง\nเยียว\nเยี่ยว\nเยียวยา\nเยือ\nเยื่อ\nเยื้อ\nเยือก\nเยือง\nเยื่อง\nเยื้อง\nเยือน\nเยื้อน\nแย่\nแย้\nแยก\nแยง\nแย่ง\nแย้ง\nแยงแย่\nแยงแย้\nแยบ\nแย็บ\nแยม\nแย้ม\nแยแส\nแยะ\nโย\nโย้\nโยก\nโยกเยก\nโยคาพจร\nโยคาวจร\nโยคเกณฑ์\nโยคยะ\nโยคะ\nโยคิน\nโยคี\nโยง\nโย่ง\nโย่งเย่ง\nโยงโย่\nโยชน์\nโยชนา\nโยถิกะ\nโยทะกา\nโยธวาทิต\nโยธา\nโยธิน\nโยน\nโยนก\nโยนิโส\nโยนี\nโยม\nโยโส\nใย\nไย\nไย่\nไยดี\nไยไพ\nรก\nรง\nรงค์\nรงควัตถุ\nรงรอง\nรจนา\nรจเรข\nรจเลข\nรจิต\nรชตะ\nรชนิ\nรชนี\nรชะ\nรณรงค์\nรด\nรดี\nรตนะ\nรตะ\nรติ\nรถ\nรน\nร่น\nรบ\nรบาญ\nรพี\nรม\nร่ม\nรมณี\nรมณีย์\nรมณียสถาน\nรมย์\nรมเยศ\nรยางค์\nรวก\nรวง\nร่วง\nรวด\nรวน\nรวนเร\nร่วน\nรวบ\nรวม\nร่วม\nรวย\nรวิ\nรวิวาร\nรวี\nรศนา\nรส\nรสนา\nรสสุคนธ์\nรสายนเวท\nรสิ
ก\nรหัท\nรหัส\nรโห\nรโหฐาน\nรอ\nร่อ\nรอก\nรอง\nร่อง\nร้อง\nรองเง็ง\nร่องแร่ง\nรอด\nรอน\nร่อน\nร้อน\nรอบ\nรอบคอบ\nรอม\nรอมชอม\nรอมร่อ\nรอย\nร่อย\nร้อย\nร่อแร่\nระ\nระกะ\nระกา\nระกำ\nระเกะระกะ\nระคน\nระคาง\nระคาย\nระแคะ\nระฆัง\nระงม\nระงับ\nระแง้\nระโงกหิน\nระชวย\nระดม\nระดะ\nระดับ\nระดา\nระด่าว\nระดู\nระเด่น\nระเดียง\nระแด\nระตู\nระทก\nระทด\nระทม\nระทวย\nระทา\nระทึก\nระแทะ\nระนาด\nระนาบ\nระนาม\nระนาว\nระเนน\nระเนระนาด\nระเนียด\nระแนง\nระแนะ\nระบบ\nระบม\nระบอบ\nระบัด\nระบับ\nระบาด\nระบาย\nระบำ\nระบิล\nระบือ\nระบุ\nระเบง\nระเบ็ง\nระเบิด\nระเบียง\nระเบียน\nระเบียบ\nระแบบ\nระมัดระวัง\nระมาด\nระเมียร\nระย่อ\nระย่อม\nระยะ\nระยั้ง\nระยับ\nระย้า\nระยาบ\nระยำ\nระยิบระยับ\nระโยง\nระโยงระยาง\nระรวย\nระรอง\nระร่อน\nระรัว\nระราน\nระร่าย\nระริก\nระรี่\nระรึง\nระรื่น\nระรื้น\nระเร้ง\nระเริง\nระเรียง\nระเรื่อย\nระแรง\nระลวง\nระลอก\nระลึก\nระวัง\nระวาง\nระวาย\nระวิง\nระแวง\nระแวดระวัง\nระไว\nระส่ำระสาย\nระหกระเหิน\nระหง\nระหวย\nระหว่าง\nระหองระแหง\nระหัด\nระหาย\nระเห็จ\nระเหย\nระเหระหน\nระเหหน\nระเหิด\nระเหินระหก\nระแหง\nระโหย\nระอา\nระอิดระอา\nระอุ\nรัก\nรักข์\nรักขิต\nรักตะ\nรักบี้\nรักเร่\nรักแร้\nรักษ์\nรักษา\nรัง\nรั้ง\nรังเกียจ\nรังแก\nรังค์\nรังควาน\nรังแค\nรังรอง\nรังวัด\nรังสิ\nรังสี\nรังสิมันตุ์\nรังสิมา\nรัจฉา\nรัช\nรัชชูปการ\nรัชมังคลาภิเษก\nรัชชุ\nรัชฎาภิเษก\nรัชดาภิเษก\nรัชนะ\nรัชนี\nรัญจวน\nรัฏฐาภิปาลโนบาย\nรัฐ\nรัฐประศาสโนบาย\nรัฐประศาสนศาสตร์\nรัด\nรัต\nรัตกัมพล\nรัตมณี\nรัตคน\nรัตจันทน์\nรัตตัญญู\nรัตติ\nรัตน์\nรัตนะ\nรัตนโกสินทร์\nรัตนโกสินทรศก\nรัตนชาติ\nรัตนตรัย\nรัตนบัลลังก์\nรัตนวราภรณ์\nรัตนสิงหาสน์\nรัตนา\nรัตนากร\nรัตนาภรณ์\nรัตนาวลี\nรัตมา\nรัถ\nรัถยา\nรัทเทอร์ฟอร์เดียม\nรัน\nรั้น\nรันทด\nรันทวย\nรับ\nรัมณียสถาน\nรัมภา\nรัมมี่\nรัมย์\nรัย\nรัว\nรั่ว\nรั้ว\nรัศมิมัต\nรัศมิมาน\nรัศมี\nรัษฎากร\nรัสเซีย\nรัสสะ\nรัสสระ\nรา\nร่า\nร้า\nราก\nรากษส\nรากสาด\nราคะ\nราคจริต\nราคา\nราคิน\nราคี\nราง\nร่าง\nร้าง\nรางจืด\nรางชาง\nรางวัล\nราช\nราชกิจจานุเบกษา\nราชนิกุล\nราชวโรงการ\nราชญี\nราชดัด\nราชพฤกษ์\nราชมาณพ\nราชมาษ\nราชมาส\nราชย์\nราชสีห์\nราชะ\nราชัน\nราชันย์\nราชัย
\nราชา\nราชาธิปไตย\nราชาธิราช\nราชาภิเษก\nราชายตนะ\nราชาวดี\nราชี\nราชินิกุล\nราชินีกุล\nราชินี\nราชินูปถัมภ์\nราชูปถัมภ์\nราชูปโภค\nราเชน\nราเชนทร์\nราเชนทรยาน\nราโชวาท\nราไชศวรรย์\nราญ\nราญรอน\nราด\nราต\nราตร\nราตรี\nราน\nร่าน\nร้าน\nราบ\nราพณ์\nราพณาสูร\nราม\nรามเกียรติ์\nรามสูร\nรามัญ\nรามา\nราย\nร่าย\nร้าย\nราว\nร้าว\nราวี\nราศี\nราษฎร\nราษฎร์\nราษตรี\nราษราตรี\nราหุ\nราหู\nรำ\nร่ำ\nรำคาญ\nรำงับ\nรำจวน\nรำบาญ\nรำพัน\nรำพาย\nรำพึง\nรำเพย\nรำไพ\nรำมะนา\nรำมะนาด\nรำมะร่อ\nร่ำรวย\nร่ำร่ำ\nรำไร\nรำลึก\nรำหัด\nรำหัส\nริ\nริก\nริดสีดวง\nริน\nริ้น\nริบ\nริบบิ้น\nริบรี่\nริบหรี่\nริปุ\nริปู\nริม\nริ้ว\nริษยา\nรี\nรี่\nรี้พล\nรีด\nรีดักชัน\nรีต\nรีเนียม\nรีบ\nรีม\nรีรอ\nรี้ริก\nรึง\nรึ้ง\nรื่น\nรื้น\nรื้อ\nรุ\nรุก\nรุกข์\nรุกขชาติ\nรุกขเทวดา\nรุกขมูล\nรุกขา\nรุกรุย\nรุ่ง\nรุ้ง\nรุงรัง\nรุ่งริ่ง\nรุจ\nรุจา\nรุจนะ\nรุจิ\nรุจี\nรุจิระ\nรุจิรา\nรุด\nรุต\nรุทธ์\nรุทระ\nรุธิร\nรุธิระ\nรุเธียร\nรุน\nรุ่น\nรุบรู่\nรุม\nรุ่ม\nรุ่มร่าม\nรุย\nรุ่ย\nรุรุ\nรุหะ\nรู\nรู่\nรู้\nรูจี\nรูด\nรูทีเนียม\nรูบิเดียม\nรูป\nรูปิยะ\nรูปี\nรูเล็ตต์\nเร่\nเรข\nเรขา\nเรขาคณิต\nเร็ง\nเร่ง\nเร้ง\nเรณุ\nเรณู\nเรดอน\nเรดาร์\nเรเดียม\nเร้น\nเรรวน\nเรไร\nเร็ว\nเร่ว\nเรวดี\nเรอ\nเร่อ\nเรา\nเร่า\nเร้า\nเราะ\nเริง\nเริด\nเริม\nเริ่ม\nเริ้ม\nเริศร้าง\nเรี่ย\nเรี้ย\nเรียก\nเรียง\nเรียด\nเรียน\nเรียบ\nเรียม\nเรี่ยม\nเรียว\nเรี่ยว\nเรี้ยวรก\nเรือ\nเรื่อ\nเรื้อ\nเรือก\nเรือง\nเรื่อง\nเรื้อง\nเรืองรอง\nเรือด\nเรือน\nเรื้อน\nเรื่อย\nแร\nแร่\nแรก\nแร็กเกต\nแรง\nแร่ง\nแร้ง\nแรด\nแร้นแค้น\nแรม\nแร้ว\nแระ\nโร\nโร่\nโรค\nโรคา\nโรคาพาธ\nโรง\nโรจ\nโรจน์\nโรเดียม\nโรตี\nโรท\nโรธ\nโรม\nโรมัน\nโรเมอร์\nโรย\nโรเร\nโรหิณี\nโรหิต\nไร\nไร่\nไร้\nไรย์\nฤกษ์\nฤกษณะ\nฤคเวท\nฤชา\nฤชุ\nฤณ\nฤดี\nฤดียา\nฤดู\nฤต\nฤติยา\nฤตุ\nฤทธา\nฤทธิ์\nฤทัย\nฤษภ\nฤษยา\nฤษี\nฤๅ\nฤๅดี\nฤๅทัย\nฤๅษี\nฤๅสาย\nลก\nล่ก\nลฆุ\nลง\nล่ง\nลงกา\nล้งเล้ง\nลด\nลดา\nลดาวัลย์\nลน\nล้น\nลบ\nลบอง\nลพ\nลพุช\nลม\nล่ม\nล้ม\nลมาด\nลรรลุง\nลลนา\nลลิต\nลวก\nลวง\nล่วง\nล้วง\nลวณะ\nลวด\nล้วน\nลวนลาม\nลวนะ\nล่วม\nลวะ\nลวิตร\nลหุ\nลหุกาบัติ\nล่อ\nล้อ\nลอก\nล็อก\nล็อกเกต\nลอกแลก\n
ลอการิทึม\nลอง\nล่อง\nลองกอง\nลองจิจูด\nลองไน\nลอด\nลอตเตอรี่\nลอน\nล่อน\nลอบ\nลอม\nล้อม\nลอมชอม\nลอมพอก\nลอย\nล่อย\nล่อแล่\nลอว์เรนเซียม\nลออ\nละ\nล่ะ\nละคร\nละติจูด\nละบม\nละบอง\nละบือ\nละเบ็ง\nละโบม\nละม่อม\nละมั่ง\nละมาน\nละม้าย\nละมุ\nละมุด\nละมุน\nละเมอ\nละเมาะ\nละเมิด\nละเมียด\nละแมะ\nละโมก\nละโมบ\nละไม\nละลวย\nละลอก\nละล้า\nละล้าละลัง\nละลาน\nละลาบละล้วง\nละลาย\nละล้าว\nละล่ำละลัก\nละลิบ\nละลุม\nละเลง\nละเล้า\nละเลาะ\nละเลิง\nละเลียด\nละเลียบ\nละไล้\nละว้า\nละวาด\nละเวง\nละแวก\nละโว้\nละหมาด\nละห้อย\nละหาน\nละหาร\nละหุ่ง\nละเหย\nละเหี่ย\nละอง\nละออง\nละอาย\nละเอียด\nละแอน\nลัก\nลักขณะ\nลักขณา\nลักขะ\nลักขี\nลักจั่น\nลักปิดลักเปิด\nลักษณ์\nลักษณนาม\nลักษณะ\nลักษณาการ\nลักษมณ์\nลักษมาณา\nลักษมี\nลักษะ\nลัคคะ\nลัคน์\nลัคนา\nลัง\nลั่ง\nลังกา\nลังคี\nลังถึง\nลังลอง\nลังเล\nลังสาด\nลัชชา\nลัชชี\nลัญจ์\nลัญจกร\nลัญฉกร\nลัญฉน์\nลัฐิ\nลัฐิกา\nลัด\nลัดา\nลัทธ์\nลัทธิ\nลัน\nลั่น\nลันเต\nลันเตา\nลันไต\nลั่นทม\nลันโทม\nลับ\nลัพธ์\nลัพธิ\nลัภ\nลัภนะ\nลัภย์\nลัมพ์\nลัย\nลา\nล่า\nล้า\nลาก\nลาง\nล่าง\nล้าง\nลางลิง\nลางสาด\nลาช\nลาชะ\nลาชา\nลาญ\nลาด\nลาดเลา\nล้าต้า\nล่าเตียง\nลาน\nล่าน\nล้าน\nลาบ\nลาพอน\nลาภ\nลาม\nล่าม\nลามก\nลาย\nล้าย\nลายสือ\nลาลา\nลาว\nลาวัณย์\nลาวา\nลำ\nล่ำ\nล้ำ\nลำเข็ญ\nลำแข\nลำเค็ญ\nลำเจียก\nลำดวน\nลำดับ\nลำเนา\nลำบอง\nลำบาก\nลำปำ\nลำพวน\nลำพอง\nลำพัง\nลำพู\nลำเพ็ญ\nลำเพา\nลำแพน\nลำโพง\nลำไพ่\nลำภุขัน\nลำมะลอก\nลำยอง\nลำไย\nลำลอง\nล่ำลา\nลำลาบ\nลำลึก\nลำเลาะ\nลำเลิก\nลำเลียง\nลำเวียง\nลำเอียก\nลำเอียง\nลิ\nลิกขา\nลิกไนต์\nลิกู\nลิเก\nลิขนะ\nลิขสิทธิ์\nลิขิต\nลิง\nลิงค์\nลิด\nลิต\nลิตมัส\nลิตร\nลิเทียม\nลิ่น\nลิ้น\nลินจง\nลิ้นจี่\nลินลา\nลินสีด\nลิ่นฮื้อ\nลินิน\nลิบ\nลิปดา\nลิปสติก\nลิปิ\nลิฟต์\nลิเภา\nลิ่ม\nลิ้ม\nลิมป์\nลิมปนะ\nลิลิต\nลิว\nลิ่ว\nลิสง\nลี\nลี่\nลี้\nลีซอ\nลีบ\nลีลา\nลีลาศ\nลีฬหา\nลึก\nลึงค์\nลืด\nลื่น\nลื้น\nลืบ\nลืม\nลือ\nลื่อ\nลื้อ\nลุ\nลุก\nลุง\nลุ้ง\nลุ่น\nลุ้น\nลุพธ์\nลุ่ม\nลุมพี\nลุมพู\nลุย\nลุ่ย\nลุ้ย\nลู่\nลูก\nลูกระมาศ\nลูกเอ็น\nลูขะ\nลูทีเชียม\nลูบ\nเลก\nเล็ก\nเลข\nเลขา\nเลขาธิการ\nเลขานุการ\nเล็ง\nเล้ง\nเล่งฮื้อ\nเลเซอร์\nเลฑฑ
ุ\nเลณฑุ\nเลณะ\nเล็ด\nเลน\nเล็น\nเล่น\nเลนส์\nเล็บ\nเลบง\nเลปกร\nเลปน์\nเลเป\nเลเพ\nเล็ม\nเล่ม\nเลย\nเลว\nเลวง\nเลวูโลส\nเลศ\nเลษฏุ\nเล่ห์\nเล่ห์กระเท่ห์\nเลหลัง\nเลหะ\nเลอ\nเล่อ\nเลอะ\nเลอะเทอะ\nเละ\nเละเทะ\nเลา\nเล่า\nเล้า\nเลากัย\nเล้าโลม\nเลาะ\nเลิก\nเลิ่กลั่ก\nเลิง\nเลิ้ง\nเลินเล่อ\nเลิศ\nเลีย\nเลียง\nเลี่ยง\nเลี้ยง\nเลียงผา\nเลียงฝ้าย\nเลียงมัน\nเลียน\nเลี่ยน\nเลียนไฟ\nเลียบ\nเลี่ยม\nเลียว\nเลี้ยว\nเลือก\nเลือง\nเลื่อง\nเลือด\nเลือน\nเลื่อน\nเลื่อม\nเลื่อย\nเลื้อย\nเลื่อยล้า\nแล\nแล่\nแล้\nแลก\nแล็กเกอร์\nแล็กโทส\nแลง\nแล่ง\nแล้ง\nแลน\nแล่น\nแลนทานัม\nแลบ\nแล้ว\nและ\nโล่\nโล้\nโลก\nโลกเชษฐ์\nโลกธรรม\nโลกธาตุ\nโลกนาถ\nโลกบาล\nโลกย์\nโลกัย\nโลกวัชชะ\nโลกวิทู\nโลกัตถจริยา\nโลกันตร์\nโลกา\nโลกาธิบดี\nโลกาธิปไตย\nโลกานุวัตร\nโลกาภิวัตน์\nโลกามิส\nโลกายัต\nโลกาวินาศ\nโลกิยะ\nโลกีย์\nโลกียวัตร\nโลกียวิสัย\nโลกียสุข\nโลกุตระ\nโลกุตรธรรม\nโลกุตรภูมิ\nโลง\nโล่ง\nโล้ง\nโล่งโจ้ง\nโล่งโต้ง\nโล้งโต้ง\nโลจนะ\nโลณะ\nโลด\nโล่ติ๊น\nโลโต\nโลท\nโลน\nโล้น\nโลภ\nโลม\nโลมเล้า\nโลมะ\nโลมา\nโลลุป\nโลเล\nโลโล\nโลโล้\nโลหะ\nโลหกุมภี\nโลหัช\nโลหิต\nไล่\nไล้\nไลย\nไลลา\nไล่เลี่ย\nฦๅ\nฦๅชา\nฦๅสาย\nวก\nวง\nวงก์\nวงกต\nวงศ์\nวงศกร\nวงศา\nวงษ์\nวจนะ\nวจี\nวชิระ\nวชิรปาณี\nวชิรหัตถ์\nวชิราวุธ\nวฏะ\nวฏาการ\nวณิช\nวณิชชา\nวณิชย์\nวณิชยา\nวณิพก\nวดี\nวทนะ\nวทัญญุตา\nวทัญญู\nวธุกา\nวธู\nวน\nวนศาสตร์\nวนสณฑ์\nวนสัณฑ์\nวนอุทยาน\nวนัส\nวนัสบดี\nวนา\nวนาดร\nวนาดอน\nวนานต์\nวนาลัย\nวนาลี\nวนาวาส\nวนาศรม\nวนาสณฑ์\nวนาสัณฑ์\nวนิดา\nวนิพก\nวเนจร\nวโนทยาน\nวยัคฆ์\nวยากรณ์\nวรดนู\nวรทาน\nวรมหาวิหาร\nวรงค์\nวรณะ\nวรรค\nวรรคย์\nวรรช\nวรรชย์\nวรรณะ\nวรรณกรรม\nวรรณคดี\nวรรณยุกต์\nวรรณยุต\nวรรณศิลป์\nวรรณนา\nวรรณพฤติ\nวรรณึก\nวรรธกะ\nวรรธนะ\nวรรษ\nวรรษา\nวรวิหาร\nวรัญญู\nวรางคณา\nวรางคนา\nวราห์\nวราหะ\nวรุณ\nวโรดม\nวฤก\nวลัช\nวลัญช์\nวลัญชน์\nวลัย\nวลาหก\nวลี\nวศค\nวศะ\nวศิน\nวสนะ\nวสภะ\nวสละ\nวสวัดดี\nวสวัตตี\nวสะ\nวสันต์\nวสันตดิลก\nวสันตฤดู\nวสันตวิษุวัต\nวสา\nวสี\nวสุ\nวสุธา\nวสุนธรา\nวสุมดี\nวหะ\nวอ\nวอก\nวอกแวก\nว่องไว\nวอด\nวอน\nว่อน\nว็อบ\nวอมแวม\nวอลเลย์บอล\nวอแว\nวะ\nวัก\nวักกะ\nวัคคีย์\nวัคคุ\nวัคซีน\
nวัง\nวังก์\nวังชา\nวังเวง\nวังศะ\nวังสะ\nวัจจะ\nวัจกุฎี\nวัจฉละ\nวัจน์\nวัช\nวัชชะ\nวัชพืช\nวัชฌ์\nวัชระ\nวัชรปาณี\nวัชรยาน\nวัชรอาสน์\nวัชราสน์\nวัชรินทร์\nวัชรี\nวัชเรนทร์\nวัฏ\nวัฏฏะ\nวัฏจักร\nวัฏทุกข์\nวัฏสงสาร\nวัฏกะ\nวัฏฏิ\nวัฒกะ\nวัฒกี\nวัฒนธรรม\nวัฒนะ\nวัฒนา\nวัณ\nวัณโรค\nวัณฏ์\nวัณณะ\nวัณนา\nวัด\nวัต\nวัตต์\nวัตตา\nวัตถ์\nวัตถาภรณ์\nวัตถาลังการ\nวัตถุ\nวัตนะ\nวัตร\nวัตสดร\nวัตสะ\nวัติ\nวัทน์\nวัน\nวันต์\nวันทนา\nวันทนาการ\nวันทนีย์\nวันทยหัตถ์\nวันทยาวุธ\nวันทา\nวันทิ\nวับ\nวับวาบ\nวับวาม\nวับแวบ\nวับแวม\nวัปปะ\nวัมมิกะ\nวัย\nวัลก์\nวัลคุ\nวัลย์\nวัลลภ\nวัลลี\nวัว\nวัสสะ\nวัสโสทก\nวัสดุ\nวัสตร์\nวัสน์\nวัสนะ\nวัสสานะ\nวัสสานฤดู\nวา\nว่า\nว้า\nว้าเหว่\nวาก\nว้าก\nวากยสัมพันธ์\nวากยะ\nวาง\nว่าง\nว้าง\nวาจก\nวาจา\nวาจาไปยะ\nวาจาล\nวาชเปยะ\nวาณิช\nวาณิชกะ\nวาณิชย์\nวาณี\nวาด\nวาต\nวาตะ\nวาตภัย\nวาท\nวาทศาสตร์\nวาทศิลป์\nวาทกะ\nวาทนะ\nวาทย์\nวาทยกร\nวาทิต\nวาทิน\nวาที\nวาน\nวานซืน\nว่าน\nวานร\nวานรินทร์\nวาเนเดียม\nวาบ\nวาปี\nวาม\nวามน\nวามนาวตาร\nวามะ\nวาย\nว่าย\nว้าย\nวายะ\nวาโย\nวายามะ\nวายุ\nวายุกูล\nวาร\nวาระ\nวารสาร\nวารสารศาสตร์\nวาริ\nวารี\nวาริช\nวารีช\nวาริท\nวาริธร\nวารุณ\nวารุณี\nวาล\nวาลวีชนี\nวาล์ว\nวาลิกา\nวาลุกา\nวาว\nว่าว\nว้าว่อน\nว้าวุ่น\nวาสนะ\nวาสนา\nวาสพ\nวาสะ\nวาสิน\nวาสี\nวาสุกรี\nวาสุกี\nวาสุเทพ\nวาหนะ\nวาหะ\nวาหินี\nวาฬ\nวิกขัมภ์\nวิกขัมภนะ\nวิกเขป\nวิกรม\nวิกรัย\nวิกรานต์\nวิกฤต\nวิกฤติ\nวิกล\nวิกสิต\nวิกัต\nวิกัติ\nวิกัติการก\nวิกัป\nวิกัย\nวิการ\nวิกาล\nวิกาลโภชน์\nวิคหะ\nวิเคราะห์\nวิฆเนศ\nวิฆเนศวร\nวิฆาต\nวิง\nวิ่ง\nวิ่งเปี้ยว\nวิงวอน\nวิจฉิกะ\nวิจล\nวิจักขณ์\nวิจักษ์\nวิจักษณ์\nวิจัย\nวิจาร\nวิจารณ์\nวิจารณญาณ\nวิจิ\nวิจิกิจฉา\nวิจิต\nวิจิตร\nวิจิน\nวิจุณ\nวิจุรณ\nวิชชา\nวิชชุ\nวิชชุดา\nวิชชุตา\nวิชชุลดา\nวิชญะ\nวิชน\nวิชนี\nวิชย\nวิชัย\nวิชา\nวิชานนะ\nวิชิต\nวิเชียร\nวิญญัตติ\nวิญญาณ\nวิญญาณกทรัพย์\nวิญญู\nวิฑูรย์\nวิด\nวิตก\nวิตถาร\nวิตามิน\nวิถี\nวิทธะ\nวิทยฐานะ\nวิทยา\nวิทยาคม\nวิทยาคาร\nวิทยาลัย\nวิทยุ\nวิทยุต\nวิทวัส\nวิทัตถิ\nวิทัศน์\nวิทารณ์\nวิทิต\nวิทู\nวิทูร\nวิเทศ\nวิเทโศบาย\nวิธ\nวิธวา\nวิธาน\nวิธี\nวิธุระ\nวิธู\nวิธ
ูปนะ\nวิ่น\nวินตกะ\nวินัย\nวินาที\nวินายก\nวินาศ\nวินิจ\nวินิจฉัย\nวินิต\nวินิบาต\nวินิปาติก\nวิเนต\nวิบัติ\nวิบาก\nวิบุล\nวิบุลย์\nวิบูล\nวิบูลย์\nวิปการ\nวิปฏิสาร\nวิปโยค\nวิประโยค\nวิปริต\nวิปลาส\nวิปวาส\nวิปักษ์\nวิปัสสก\nวิปัสสนา\nวิปัสสนายานิก\nวิพากษ์\nวิพิธทัศนา\nวิพุธ\nวิภว\nวิภวตัณหา\nวิภังค์\nวิภัช\nวิภัตติ\nวิภา\nวิภาค\nวิภาช\nวิภาดา\nวิภาวี\nวิภาษ\nวิภาส\nวิภู\nวิภูษณะ\nวิภูษา\nวิภูษิต\nวิมน\nวิมล\nวิมลัก\nวิมังสา\nวิมัติ\nวิมาน\nวิมุข\nวิมุต\nวิมุตติ\nวิเมลือง\nวิโมกข์\nวิโยค\nวิระ\nวิรงรอง\nวิรังรอง\nวิรัช\nวิรัต\nวิรัติ\nวิราคะ\nวิราม\nวิริยภาพ\nวิริยะ\nวิรุธ\nวิรุฬห์\nวิรุฬหก\nวิรูป\nวิรูปักษ์\nวิเรนทร์\nวิโรจ\nวิโรจน์\nวิโรฒ\nวิโรธ\nวิลันดา\nวิลัย\nวิลาด\nวิลาศ\nวิลาป\nวิลาวัณย์\nวิลาส\nวิลาสินี\nวิลิปดา\nวิลิศมาหรา\nวิเลป\nวิเลปนะ\nวิโลกนะ\nวิโลม\nวิไล\nวิไลวรรณ\nวิวรณ์\nวิวรรธน์\nวิวัฏ\nวิวัฒน์\nวิวัฒนาการ\nวิวัน\nวิวาท\nวิวาห์\nวิวาหมงคล\nวิวาหะ\nวิวิต\nวิวิธ\nวิเวก\nวิศรุต\nวิศว\nวิศวกร\nวิศวกรรม\nวิศวกรรมศาสตร์\nวิศัลย์\nวิศาขบูชา\nวิศาขา\nวิศาล\nวิศิษฏ์\nวิศุทธ์\nวิศุทธิ์\nวิเศษ\nวิเศษณ์\nวิษณุ\nวิษณุกรรม\nวิษธร\nวิษัย\nวิษาณ\nวิษุวัต\nวิสกี้\nวิสรรชนีย์\nวิสฤต\nวิสสุกรรม\nวิสัชนา\nวิสัญญี\nวิสัย\nวิสัยทัศน์\nวิสาขบูชา\nวิสาขะ\nวิสาขา\nวิสามัญ\nวิสามานยนาม\nวิสาร\nวิสารทะ\nวิสาล\nวิสาสะ\nวิสาหกิจ\nวิสิฐ\nวิสุงคามสีมา\nวิสุทธ์\nวิสุทธิ์\nวิสูตร\nวิเสท\nวิหค\nวิหลั่น\nวิหาร\nวิหิงสา\nวิเหสา\nวิฬาร\nวิฬาร์\nวี\nวีจิ\nวีชนี\nวีณา\nวี้ด\nวีรกรรม\nวีรชน\nวีรบุรุษ\nวีรสตรี\nวี่วัน\nวี่แวว\nวีสะ\nวุ้ง\nวุฐิ\nวุฒ\nวุฒิ\nวุด\nวุ่น\nวุ้น\nวุบ\nวุ้ย\nวุลแฟรม\nวู้\nวูดวาด\nวูบ\nวู่วาม\nเว้\nเวค\nเวคิน\nเวคี\nเวจ\nเวช\nเวชยันต์\nเวฐน์\nเวณิ\nเวณิก\nเวณุ\nเวตน์\nเวตร\nเวตาล\nเวท\nเวทคู\nเวทนา\nเวทย์\nเวทัลละ\nเวทางค์\nเวทางคศาสตร์\nเวทานต์\nเวทานตะ\nเวทิ\nเวที\nเวธะ\nเวน\nเว้น\nเวนไตย\nเวไนย\nเวมัต\nเว้ย\nเวยยากรณะ\nเวร\nเวรมณี\nเวรี\nเวโรจน์\nเวลา\nเวเลนซี\nเวศม์\nเวศย์\nเวศยา\nเวสน์\nเวสภู\nเวสม์\nเวสวัณ\nเวสสะ\nเวสสันดร\nเวสสุกรรม\nเวสสุวัณ\nเวสารัช\nเวสิยา\nเวหน\nเวหะ\nเวหา\nเวหาส\nเวฬุ\nเวฬุริยะ\nเว่อ\nเว้า\nเวิก\nเวิ้ง\nเวี่ย\nเวียง\nเวียด\nเวียดนาม\n
เวียน\nเวียร\nเวี่ยว\nแว้\nแวง\nแว้ง\nแวด\nแว้ด\nแวน\nแว่น\nแวนดา\nแวบ\nแว็บ\nแวม\nแว็ม\nแวว\nแว่ว\nแวะ\nโว\nโว่\nโวการ\nโว่ง\nโวทาน\nโวย\nโว้ย\nโว้เว้\nโวหาร\nไว\nไว้\nไวกูณฐ์\nไวฑูรย์\nไวทย์\nไวน์\nไวพจน์\nไวยากรณ์\nไวยาวัจกร\nไวยาวัจมัย\nไวรัส\nไววรรณ\nไวษณพ\nไวโอลิน\nศก\nศกุน\nศกุนต์\nศกุนิ\nศกุนี\nศจี\nศตะ\nศตภิษัช\nศตวรรษ\nศตพรรษ\nศตกะ\nศนิ\nศพ\nศมนะ\nศมะ\nศยาม\nศยามล\nศร\nศรายุธ\nศราวรณ์\nศรรกรา\nศรวณะ\nศรวณีย์\nศรวิษฐา\nศรัณย์\nศรัณยู\nศรัท\nศรัทธา\nศรัย\nศราทธ์\nศราทธพรต\nศราพก\nศราวก\nศราวณะ\nศรี\nศรีตรัง\nศรุติ\nศฤคาล\nศฤงค์\nศฤงคาร\nศฤงคาริน\nศฤงคารี\nศลิษฏ์\nศลิษา\nศวะ\nศวัส\nศวา\nศวาน\nศศะ\nศศธร\nศศพินทุ์\nศศลักษณ์\nศศิ\nศศิน\nศศี\nศศิขัณฑ์\nศศิธร\nศศิมณฑล\nศศิวิมล\nศอ\nศอก\nศักดา\nศักดิ\nศักดิ์\nศักดินา\nศักติ\nศักย\nศักยภาพ\nศักย์\nศักยะ\nศักร\nศักรินทร์\nศักเรนทร์\nศักราช\nศังกร\nศัตรู\nศันสนะ\nศันสนีย์\nศัพท์\nศัยยา\nศัล\nศัลย์\nศัลยกรรม\nศัลยแพทย์\nศัลยศาสตร์\nศัสดร\nศัสตร\nศัสตรศาสตร์\nศัสตรา\nศัสตราวุธ\nศากตะ\nศากย\nศากยะ\nศากยพุทธ\nศากยมุนี\nศาฎก\nศาณ\nศานต์\nศานติ\nศาป\nศารท\nศารทูล\nศาริกา\nศาล\nศาลา\nศาศวัต\nศาสดา\nศาสตร์\nศาสตรา\nศาสตราจารย์\nศาสนา\nศาสนกิจ\nศาสนจักร\nศาสนธรรม\nศาสนบุคคล\nศาสนพิธี\nศาสนวัตถุ\nศาสนศาสตร์\nศาสนสถาน\nศาสนสมบัติ\nศาสนิกชน\nศาสนีย์\nศาสนูปถัมภก\nศาสน์\nศิกษก\nศิการ\nศิขร\nศิขริน\nศิขรี\nศิขัณฑ์\nศิคาล\nศิงขร\nศิงขริน\nศิตะ\nศิถี\nศิพิระ\nศิระ\nศิรประภา\nศิราภรณ์\nศิโรรัตน์\nศิโรเวฐน์\nศิรา\nศิรามพุช\nศิโรราบ\nศิลป\nศิลป์\nศิลปะ\nศิลปกร\nศิลปกรรม\nศิลปกิจ\nศิลปวัตถุ\nศิลปวิทยา\nศิลปศาสตร์\nศิลปศึกษา\nศิลปหัตถกรรม\nศิลปิน\nศิลปี\nศิลา\nศิวะ\nศิวโมกข์\nศิวลึงค์\nศิวเวท\nศิวาลัย\nศิศีระ\nศิษฎิ\nศิษฏ์\nศิษย์\nศิษยานุศิษย์\nศีขร\nศีต\nศีตกาล\nศีรษะ\nศีล\nศึก\nศึกษา\nศึกษาธิการ\nศึกษานิเทศก์\nศุกร์\nศุกรวรรณ\nศุกรวาร\nศุกระ\nศุกล\nศุกลปักษ์\nศุจิ\nศุทธะ\nศุทธิ\nศุนะ\nศุนิ\nศุภกร\nศุภเคราะห์\nศุภนิมิต\nศุภมัสดุ\nศุภมาตรา\nศุภมาส\nศุภอักษร\nศุภางค์\nศูกร\nศุลกากร\nศุลการักษ์\nศุลี\nศุษิร\nศูทร\nศูนย์\nศูนยวาท\nศูละ\nศูลิน\nเศรณี\nเศรษฐ\nเศรษฐ์\nเศรษฐกิจ\nเศรษฐศาสตร์\nเศรษฐี\nเศร้า\nเศลษ\nเศวต\nเศวตร\nเศวตัมพร\nเศษ\nเศาจ\
nเศาร์\nเศารยะ\nเศิก\nเศียร\nโศก\nโศกา\nโศกาดูร\nโศกาลัย\nโศกี\nโศจิ\nโศธนะ\nโศภน\nโศภะ\nโศภา\nโศภิต\nโศภิน\nโศภิษฐ์\nโศภี\nโศรดา\nโศรตร\nโศลก\nไศล\nไศวะ\nษมา\nษัฏ\nษัฑ\nษัณ\nษัษ\nษัษฐะ\nษัษฐี\nโษฑศัน\nสก\nสกวาที\nสกฏะ\nสกทาคามิผล\nสกิทาคามิผล\nสกทาคามิมรรค\nสกิทาคามิมรรค\nสกทาคามี\nสกิทาคามี\nสกนธ์\nสกปรก\nสกรณีย์\nสกรรจ์\nสกรรมกริยา\nสกล\nสกลมหาสังฆปริณายก\nสกัด\nสกา\nสกาว\nสกี\nสกุณ\nสกุณา\nสกุณี\nสกุน\nสกุนต์\nสกุล\nสเกต\nสแกนเดียม\nสขะ\nสง\nส่ง\nสงกร\nสงกรานต์\nสงกา\nสงค์\nสงคร\nสงคราม\nสงเคราะห์\nสงฆ์\nสงบ\nสงวน\nส่งสการ\nสงสัย\nสงสาร\nสงสารวัฏ\nสงัด\nสง่า\nสฐะ\nสณฑ์\nสด\nสดมภ์\nสดับ\nสดับปกรณ์\nสดำ\nสดุดี\nสตะ\nสตน\nสตภิสชะ\nสตรอนเชียม\nสตริกนิน\nสตรี\nสตัฟฟ์\nสตัมภ์\nสตางค์\nสติ\nสติปัฏฐาน\nสตี\nสตู\nสตูป\nสเต๊ก\nสถบดี\nสถล\nสถวีระ\nสถาน\nสถานะ\nสถานี\nสถาบัน\nสถาปนา\nสถาปนิก\nสถาปัตยกรรม\nสถาปัตยกรรมศาสตร์\nสถาปัตยเรขา\nสถาปัตยเวท\nสถาพร\nสถาวร\nสถิต\nสถิตยศาสตร์\nสถิติ\nสถิร\nสถีรวาท\nสถุล\nสถูป\nสทิง\nสทึง\nสทุม\nสธนะ\nสาธุสะ\nสน\nส้น\nสนทนา\nสนทรรศ\nสนทรรศน์\nสนเทศ\nสนเท่ห์\nสนธยา\nสนธิ\nสนน\nสนม\nสนวน\nสนอง\nสนอบ\nสนอม\nสนะ\nสนัด\nสนั่น\nสนับ\nสนับทึบ\nสนับสนุน\nสนาน\nสนาม\nสนายุ\nสนิกะ\nสนิท\nสนิธ\nสนิม\nสนุก\nสนุกเกอร์\nสนุข\nสนุต\nสนุ่น\nสบ\nสบง\nสบถ\nสบัน\nสบาย\nสบู่\nสไบ\nสปริง\nสปอร์\nสปาเกตตี\nสเปกตรัม\nสเปกโทรสโกป\nสไปริลลัม\nสพาบ\nสภา\nสภาพ\nสภาวการณ์\nสภาวะ\nสม\nสมการ\nสมจารี\nสมดุล\nสมมูล\nส้ม\nสมญา\nสมณะ\nสมณบริขาร\nสมณศักดิ์\nสมณสารูป\nสมเด็จ\nสมถะ\nสมถยานิก\nสมถวิปัสสนา\nสมนาคุณ\nสมบัติ\nสมบุกสมบัน\nสมบูรณ์\nสมบูรณาญาสิทธิราชย์\nสมประดี\nสมปฤดี\nสมปฤๅดี\nส้มป่อย\nสมปัก\nสมผุส\nสมพง\nสมพงศ์\nสมพล\nสมพัตสร\nสมพาส\nสมเพช\nสมโพธน์\nสมโพธิ\nสมภพ\nสมภาร\nสมโภค\nสมโภช\nสมมต\nสมมติ\nสมมุติ\nสมมาตร\nส้มมือ\nสมโมท\nสมโยค\nสมร\nสมรด\nสมรรถ\nสมรรถนะ\nสมรรถภาพ\nสมรส\nสมฤดี\nสมฤติ\nสมวายะ\nสมเสร็จ\nสมอ\nสมอง\nสมะ\nสมัคร\nสมังคี\nสมัช\nสมัชชา\nสมัญญา\nสมัต\nสมัน\nสมันต์\nสมัย\nสมา\nสมาคม\nสมาจาร\nสมาชิก\nสมาทาน\nสมาธิ\nสมาน\nสมานฉันท์\nสมาบัติ\nสมาพันธรัฐ\nสมาส\nสม่ำเสมอ\nสมิง\nสมิต\nสมิติ\nสมิทธ์\nสมิทธิ\nสมี\nสมุก\nสมุจจัย\nสมุจเฉท\nสมุฏฐาน\nสมุด\nสมุท
ร\nสมุทรโคดม\nสมุทัย\nสมุน\nสมุนไพร\nสมุลแว้ง\nสมุห\nสมุห์\nสมุหกลาโหม\nสมุหเทศาภิบาล\nสมุหนาม\nสมุหนายก\nสโมธาน\nสโมสร\nสยด\nสยนะ\nสยบ\nสยมพร\nสยมภู\nสยอง\nสยอน\nสยัมวรา\nสยาม\nสยามานุสติ\nสยามินทร์\nสยาย\nสยิว\nสยิ้ว\nสยุมพร\nสยุมภู\nสร\nสรง\nสร่ง\nสรณะ\nสรณคมน์\nสรณาคมน์\nสรณตรัย\nสรตะ\nสรทะ\nสรนุก\nสรเนาะ\nสรไน\nสรเพชญ\nสรภะ\nสรภัญญะ\nสรภู\nสรม\nสรร\nสรรค์\nสรรพ\nสรรพคุณ\nสรรพนาม\nสรรพสามิต\nสรรพัชญ\nสรรพากร\nสรรพางค์\nสรรเพชญ\nสรรเพชุดา\nสรรเสริญ\nสรลอน\nสรเลข\nสรวง\nสรวม\nสรวล\nสรเสริญ\nสร้อย\nสระ\nสระกอ\nสระท้อน\nสระพรั่ง\nสระอาด\nสรั่ง\nสรัสวดี\nสร่าง\nสร้าง\nสราญ\nสรี้\nสรีระ\nสรีรกิจ\nสรีรธาตุ\nสรีรวิทยา\nสรีรศาสตร์\nสรีรังคาร\nสรีรางคาร\nสรุป\nสโรช\nสโรชะ\nสฤก\nสฤต\nสฤษฎิ\nสฤษฎี\nสฤษฏ์\nสฤษดิ์\nสลด\nสลบ\nสลวน\nสลวย\nสลอด\nสลอน\nสลอย\nสละ\nสลัก\nสลัด\nสลัดได\nสลับ\nสลัว\nสลา\nสลาก\nสลาง\nสล้าง\nสลาด\nสลาตัน\nสลาบ\nสลาย\nสลิด\nสลิล\nสลึก\nสลึง\nสลุต\nสลุบ\nสลุมพร\nสแลง\nสวการย์\nสวภาพ\nสวราชย์\nสวก\nส้วง\nสวด\nสวน\nสวนะ\nสวนาการ\nส่วน\nสวนิต\nสวบ\nสวม\nส้วม\nสวย\nส่วย\nส้วย\nสวยม\nสวรรค\nสวรรค์\nสวรรคต\nสวรรคาลัย\nสวรรยา\nสวระ\nสวะ\nสวัสดิ\nสวัสดิ์\nสวัสดิการ\nสวัสดิภาพ\nสวัสดิมงคล\nสวัสดี\nสวัสติ\nสวาตี\nสวัสติกะ\nสวา\nสวาปาม\nสวาคตะ\nสวาง\nสว่าง\nสวาด\nสวาดิ\nสวาท\nสว่าน\nสว้าน\nสวาบ\nสวามิ\nสวามี\nสวามินี\nสวาย\nสวาสดิ์\nสวาหะ\nสวิง\nสวิญญาณกทรัพย์\nสวิตช์\nสสาร\nสสุระ\nสสุรี\nสหกรณ์\nสหการ\nสหจร\nสหชาต\nสหชาติ\nสหธรรม\nสหธรรมิก\nสหประชาชาติ\nสหพันธ์\nสหพันธรัฐ\nสหภาพ\nสหศึกษา\nสหัช\nสหัมบดี\nสหัส\nสหัสสะ\nสหัสธารา\nสหัสนัยน์\nสหัสเนตร\nสหัสรังสี\nสหัสา\nสหาย\nสอ\nส่อ\nสอง\nส่อง\nส้อง\nสอด\nสอน\nส่อน\nสอบ\nสอพลอ\nส้อม\nสอย\nสะ\nสะกด\nสะกอ\nสะกาง\nสะการะ\nสะกิด\nสะกิดสะเกา\nสะเก็ด\nสะแก\nสะคร้อ\nสะคราญ\nสะค้าน\nสะเงาะสะแงะ\nสะดม\nสะดวก\nสะดิ้ง\nสะดึง\nสะดือ\nสะดุ้ง\nสะดุด\nสะเด็ด\nสะเดา\nสะเดาะ\nสะตอ\nสะตาหมัน\nสะตึ\nสะตือ\nสะตุ\nสะเต๊ะ\nสะโตก\nสะทก\nสะท้อน\nสะท้าน\nสะทึก\nสะเทิน\nสะเทิ้น\nสะเทือน\nสะเทื้อน\nสะบะ\nสะบัก\nสะบักสะบอม\nสะบัด\nสะบัดสะบิ้ง\nสะบั้น\nสะบันงา\nสะบ้า\nสะบู\nสะแบง\nสะเปะสะปะ\nสะพรั่ง\nสะพรึงกลัว\nสะพรึบ\nสะพรึ่บ\nสะพัก\nสะพัง\nสะพั
ด\nสะพั้น\nสะพาน\nสะพาย\nสะเพร่า\nสะโพก\nสะเภา\nสะใภ้\nสะโมง\nสะระตะ\nสะระแหน่\nสะลาง\nสะลาบ\nสะลึมสะลือ\nสะวี้ดสะว้าด\nสะสม\nสะสวย\nสะสาง\nสะเหล่อ\nสะอาง\nสะอาด\nสะอ้าน\nสะอิ้ง\nสะอิดสะเอียน\nสะอึก\nสะอื้น\nสะเอ้ง\nสะเอว\nสะเออะ\nสะโอดสะอง\nสะไอ\nสัก\nสักกะ\nสักยะ\nสักกัจจะ\nสักกายทิฐิ\nสักการ\nสักการะ\nสักขี\nสักวา\nสักหลาด\nสัค\nสัคคะ\nสั่ง\nสังกร\nสังกรณี\nสังกรประโยค\nสังกะตัง\nสังกะวัง\nสังกะวาด\nสังกะสี\nสังกัด\nสังกัปปะ\nสังกา\nสังการ\nสังกาศ\nสังกิเลส\nสังเกต\nสังข์\nสังขกร\nสังขตธรรม\nสังขตะ\nสังขยา\nสังขลิก\nสังขลิกา\nสังขาร\nสังขารา\nสังเขป\nสังค์\nสังคญาติ\nสังคม\nสังคหะ\nสังคัง\nสังคายนา\nสังคายนาย\nสังคีต\nสังคีติ\nสังเค็ด\nสังเคราะห์\nสังฆกรรม\nสังฆการี\nสังฆเถระ\nสังฆทาน\nสังฆนายก\nสังฆปาโมกข์\nสังฆภัต\nสังฆเภท\nสังฆมณฑล\nสังฆมนตรี\nสังฆราช\nสังฆสภา\nสังฆาณัติ\nสังฆาฏิ\nสังฆาทิเสส\nสังฆาธิการ\nสังฆานุสติ\nสังฆาวาส\nสังยุตนิกาย\nสังโยค\nสังโยชน์\nสังวร\nสังวัจฉระ\nสังวัธยาย\nสังวาล\nสังวาส\nสังเวคะ\nสังเวช\nสังเวชนียสถาน\nสังเวย\nสังเวียน\nสังสกฤต\nสังสการ\nสังสนทนา\nสั่งสนทนา\nสังสรรค์\nสังสารวัฏ\nสังสิทธิ\nสังสุทธ์\nสังสุทธิ\nสังหร\nสังหรณ์\nสังหาร\nสังหาริมทรัพย์\nสังหาริมะ\nสังหิต\nสัจ\nสัจกิริยา\nสัจจะ\nสัจญาณ\nสัจธรรม\nสัจนิยม\nสัจพจน์\nสัชฌะ\nสัชฌุ\nสัญจร\nสัญเจตนา\nสัญชาตญาณ\nสัญชาติ\nสัญฌา\nสัญญา\nสัญญาณ\nสัญญี\nสัญโญชน์\nสัญนิยม\nสัญประกาศ\nสัญลักษณ์\nสัฐิ\nสัณฐาน\nสัณฐิติ\nสัณฑ์\nสัณห์\nสัด\nสัดจอง\nสัต\nสัตตะ\nสัตตาหกรณียะ\nสัตตาหกาลิก\nสัตมวาร\nสัตสดก\nสัตตบงกช\nสัตตบรรณ\nสัตตบุษย์\nสัตตู\nสัตถันดร\nสัตถา\nสัตถิ\nสัตถุ\nสัตถุศาสนา\nสัตบรรณ\nสัตย์\nสัตยพรต\nสัตยวาที\nสัตยาเคราะห์\nสัตยาธิษฐาน\nสัตยาบัน\nสัตว์\nสัตวชาติ\nสัตวบาล\nสัตวแพทย์\nสัตววิทยา\nสัตวา\nสัทธรรม\nสัทธา\nสัทธาจริต\nสัทธาธิกะ\nสัทธินทรีย์\nสัทธิงวิหาริก\nสัทธิวิหาริก\nสัทวิทยา\nสัทศาสตร์\nสัทอักษร\nสัน\nสั่น\nสั้น\nสันดาน\nสันดาป\nสันโดษ\nสันต์\nสันตติ\nสันตะปาปา\nสันตะวา\nสันติ\nสันตุฏฐี\nสันถวไมตรี\nสันถวะ\nสันถัต\nสันถาร\nสันทนะ\nสันทะ\nสันทัด\nสันทัสนะ\nสันทาน\nสันทิฐิก\nสันทิส\nสันเทหะ\nสันธาน\nสันนิธิ\nสันนิบาต\nสันนิวาส\nสันนิษฐาน\nสันสกฤต\nสับ\nสับปลับ\nสับปลี้\n
สับปะรด\nสัปคับ\nสัปดาห์\nสัปดาหะ\nสัปดน\nสัปตศก\nสัปทน\nสัปปะ\nสัปปิ\nสัปปุริส\nสัปปุรุษ\nสัประยุทธ์\nสัปหงก\nสัปเหร่อ\nสัพ\nสัพพะ\nสัพพัญญู\nสัพเพเหระ\nสัพยอก\nสัมบูรณ์\nสัมปชัญญะ\nสัมปทา\nสัมปทาน\nสัมปยุต\nสัมปโยค\nสัมประสิทธิ์\nสัมประหาร\nสัมปรายภพ\nสัมปรายิกภพ\nสัมปัตติ\nสัมผัปลาป\nสัมผัปลาปะ\nสัมผัส\nสัมพล\nสัมพหุลา\nสัมพัจฉรฉินท์\nสัมพัตสร\nสัมพัทธ์\nสัมพันธ์\nสัมพันธน์\nสัมพันธภาพ\nสัมพันธมิตร\nสัมพันธไมตรี\nสัมพาหะ\nสัมพุทธ\nสัมพุทธะ\nสัมโพธิ\nสัมภวะ\nสัมภเวสี\nสัมภัต\nสัมภัตตะ\nสัมภาระ\nสัมภาษณ์\nสัมโภคกาย\nสัมมนา\nสัมมัปธาน\nสัมมา\nสัมโมทนียกถา\nสัมฤทธิ\nสัมฤทธิ์\nสัมฤทธิศก\nสัยน์\nสัลเลข\nสัสดี\nสัสตทิฐิ\nสัสสะ\nสัสสุ\nสัสสู\nสา\nส่า\nสาก\nสากรรจ์\nสากล\nสากัจฉา\nสากัลย์\nสากิยะ\nสาเก\nสาขา\nสาคร\nสาคเรศ\nสาคู\nสาง\nส้าง\nสาชล\nสาฎก\nสาฏิก\nสาณี\nสาด\nสาไถย\nสาทร\nสาทิส\nสาทุ\nสาโท\nสาธก\nสาธยะ\nสาธยาย\nสาธารณะ\nสาธารณชน\nสาธารณประโยชน์\nสาธารณภัย\nสาธารณรัฐ\nสาธารณสถาน\nสาธารณสมบัติ\nสาธารณสุข\nสาธารณูปการ\nสาธารณูปโภค\nสาธารณ์\nสาธิต\nสาธุ\nสาน\nส่าน\nสานุ\nสานู\nสานุศิษย์\nสาบ\nสาบสูญ\nสาบาน\nสาป\nสาปไตย\nสาม\nสามชุก\nสามเณร\nสามเณรี\nสามนต์\nสามนตราช\nสามยทรัพย์\nสามล\nสามหาว\nสามะ\nสามัคคี\nสามัญ\nสามัตถิยะ\nสามานย์\nสามานยนาม\nสามารถ\nสามิต\nสามินี\nสามิภักดิ์\nสามี\nสามีจิกรรม\nสาย\nส่าย\nส้าย\nสายชู\nสายัณห์\nสายาห์\nสาร\nสารคดี\nสารธรรม\nสารนิเทศ\nสารบบ\nสารบรรณ\nสารบัญ\nสารบาญ\nสารบาญชี\nสารประโยชน์\nสารสนเทศ\nสารทุกข์\nสารถี\nสารท\nสารพัด\nสารพัน\nสารพางค์\nสารภาพ\nสารภี\nสารวัตร\nสาระ\nสาระแน\nสาระพา\nสาระยำ\nสาระวารี\nสาระสะมา\nสารัตถประโยชน์\nสารัตถศึกษา\nสารัตถะ\nสารัทธ์\nสารัมภ์\nสาราณียกร\nสาราณียธรรม\nสาราณียะ\nสารานุกรม\nสารีริกธาตุ\nสารูป\nสาโรช\nสาละ\nสาละวน\nสาลิ\nสาลิกา\nสาลินี\nสาลี\nสาลี่\nสาลู\nสาโลหิต\nสาว\nสาวก\nสาวิกา\nสาวิตร\nสาวิตรี\nสาสน\nสาสน์\nสาส์น\nสาสนา\nสาสม\nสาหร่าย\nสาหรี\nส่าหรี\nสาหัส\nสาเหตุ\nสาแหรก\nสำ\nสำส่อน\nส่ำ\nสำคัญ\nสำซ่าง\nสำแดง\nสำทับ\nสำนวน\nสำนอง\nสำนัก\nสำนาน\nสำนึก\nสำนึง\nสำเนา\nสำเนียง\nสำบัด\nสำปะลอ\nสำปะหลัง\nสำปั้น\nสำปันนี\nสำเภา\nสำมะงา\nสำมะโน\nสำมะลอ\nสำมะเลเทเมา\nสำมะหา\nสำรด\nสำรวจ\nสำรว
ม\nสำรวย\nสำรวล\nสำรอก\nสำรอง\nสำรับ\nสำราก\nสำราญ\nสำริด\nสำเร็จ\nสำเรา\nสำเริง\nสำโรง\nสำลัก\nสำลาน\nสำลี\nสำแลง\nสำหรวด\nสำหรับ\nสำหา\nสำเหนียก\nสำเหร่\nสำออย\nสำอาง\nสิ\nสิกข์\nสิข\nสิกขมานา\nสิกขา\nสิขร\nสิขรี\nสิขเรศ\nสิขา\nสิขานล\nสิขี\nสิคาล\nสิง\nสิ่ง\nสิงขร\nสิงค์\nสิงคลิ้ง\nสิงคลี\nสิงคาร\nสิงคาล\nสิงคี\nสิงโต\nสิงห์\nสิงหนาท\nสิงหบัญชร\nสิงหรา\nสิงหราช\nสิงหาคม\nสิงหาสน์\nสิงหล\nสิญจน์\nสิตะ\nสิตางศุ์\nสิถิล\nสิทธ์\nสิทธัตถะ\nสิทธา\nสิทธาจารย์\nสิทธารถ\nสิทธิ\nสิทธิ์\nสิทธิการิยะ\nสิธยะ\nสิน\nสิ้น\nสินเทา\nสินธพ\nสินธุ\nสินธุ์\nสินธุระ\nสินธู\nสินเธาว์\nสินาด\nสินิทธ์\nสินี\nสิเนรุ\nสิเนหก\nสิเนหะ\nสิเนหา\nสิเน่หา\nสิบ\nสิปปะ\nสิมพลี\nสิระ\nสิโรดม\nสิโรตม์\nสิริ\nสิรี\nสิลา\nสิว\nสิ่ว\nสิวะ\nสิวาลัย\nสิวิกา\nสี\nสี่\nสี้\nสีกา\nสีกุน\nสีข้าง\nสีด\nสีดอ\nสีดา\nสีตลรัศมี\nสีตโลทก\nสีโตทก\nสีทันดร\nสีมันต์\nสีมา\nสีละมัน\nสีวิกา\nสีสอ\nสีสะ\nสีสา\nสีสุก\nสีเสียด\nสีห์\nสีหนาท\nสีหบัญชร\nสีหราช\nสีหไสยา\nสีหไสยาสน์\nสีหะ\nสึก\nสึง\nสืบ\nสื่อ\nสุ\nสุก\nสุกข์\nสุกร\nสุกรม\nสุกำศพ\nสุกียากี้\nสุข\nสุขา\nสุขาภิบาล\nสุขารมณ์\nสุขาวดี\nสุขิน\nสุขี\nสุขุม\nสุขุมาล\nสุโข\nสุคต\nสุคติ\nสุคนธ\nสุคนธ์\nสุคนธชาติ\nสุคนธรส\nสุคันธ์\nสุคันธรส\nสุงกะ\nสุงกากร\nสุงสิง\nสุงสุมาร\nสุจริต\nสุจหนี่\nสุจิ\nสุจิต\nสุจิตร\nสุชน\nสุชัมบดี\nสุชา\nสุชาดา\nสุญ\nสุญญากาศ\nสุญตา\nสุญนิยม\nสุณ\nสุณิสา\nสุด\nสุดา\nสุต\nสุตตนิบาต\nสุตตะ\nสุตตันตปิฎก\nสุตตันตะ\nสุติ\nสุทธ\nสุทธ์\nสุทธาวาส\nสุทธิ\nสุทรรศน์\nสุทัศน์\nสุธา\nสุธาโภชน์\nสุธารส\nสุธาสินี\nสุธาสี\nสุธี\nสุนทร\nสุนทรี\nสุนทรียภาพ\nสุนทรียศาสตร์\nสุนทรียะ\nสุนัข\nสุนันท์\nสุโนก\nสุบดี\nสุบรรณ\nสุบิน\nสุปรีดิ์\nสุปรีย์\nสุปาณี\nสุพพัต\nสุพรรณ\nสุพรรณบัฏ\nสุพรรณภาชน์\nสุพรรณราช\nสุพรรณศรี\nสุพรรณถัน\nสุพรรณิการ์\nสุภร\nสุภัค\nสุภา\nสุภาพ\nสุภาษิต\nสุม\nสุ่ม\nสุมทุม\nสุมน\nสุมนะ\nสุมนัส\nสุมนา\nสุ้มเสียง\nสุมะ\nสุมาลี\nสุเมธ\nสุเมรุ\nสุรคต\nสุรเชษฐ์\nสุรบดี\nสุรภาพ\nสุรโลก\nสุรสีหนาท\nสุรเสียง\nสุรงค์\nสุรังค์\nสุรภี\nสุรัติ\nสุรัสวดี\nสุรา\nสุรางค์จำเรียง\nสุรางคนา\nสุรางคนางค์\nสุรารักษ์\nสุราลัย\nสุรินทร์\nสุรินทราหู\nสุริยะ\nสุริยก
ันต์\nสุริยกานต์\nสุริยการ\nสุริยกาล\nสุริยคติ\nสุริยคราส\nสุริยมณฑล\nสุริยวงศ์\nสุริยง\nสุริยา\nสุริเยนทร์\nสุริเยศ\nสุริโย\nสุริยน\nสุริยัน\nสุริยุปราคา\nสุรีย์\nสุรุ่ยสุร่าย\nสุลต่าน\nสุวคนธ์\nสุวภาพ\nสุวรรณ\nสุวรรณภูมิ\nสุวะ\nสุวาน\nสุวินัย\nสุวิมล\nสุษิระ\nสุสาน\nสุหนัต\nสุหร่ง\nสุหร่าย\nสุหฤท\nสุหัท\nสุเหร่า\nสู\nสู่\nสู้\nสูง\nสูจิ\nสูจิบัตร\nสูญ\nสูด\nสูต\nสูตร\nสูติ\nสูติกรรม\nสูตินรีเวช\nสูติบัตร\nสูติแพทย์\nสูติศาสตร์\nสูท\nสูทกรรม\nสูทศาสตร์\nสูบ\nสูปะ\nสูร\nสูรย์\nสูรยกานต์\nสูริ\nสูสี\nเส\nเสก\nเสกขบุคคล\nเสกขะ\nเสขบุคคล\nเสขะ\nเส็ง\nเส้ง\nเส็งเคร็ง\nเสงี่ยม\nเสฏฐี\nเสณี\nเสด\nเสด็จ\nเสตุ\nเสถียร\nเสทะ\nเสโท\nเสน\nเส้น\nเสนง\nเสน่ง\nเสน่ห์\nเสนหา\nเสน่หา\nเสนอ\nเสนะ\nเสนา\nเสนาธิการ\nเสนาบดี\nเสน่า\nเสนากุฎ\nเสนางค์\nเสนางคนิกร\nเสนานี\nเสนาสนะ\nเสนาะ\nเสนี\nเสนีย์\nเสนียะ\nเสนียด\nเสบย\nเสบียง\nเสพ\nเสพย์\nเสเพล\nเสภา\nเสม็ด\nเสมหะ\nเสมอ\nเสมา\nเสมียน\nเสมือน\nเสย\nเสร็จ\nเสริด\nเสริม\nเสรี\nเสลด\nเสลบรรพต\nเสลา\nเสลี่ยง\nเสลือกสลน\nเสโล\nเสวก\nเสวกามาตย์\nเสวนะ\nเสวนา\nเสวย\nเสวียน\nเสสรวง\nเสสรวล\nเสา\nเส้า\nเสาร์\nเสารภย์\nเสารี\nเสาวคนธ์\nเสาวธาร\nเสาวภา\nเสาวภาคย์\nเสาวภาพ\nเสาวรภย์\nเสาวรส\nเสาวลักษณ์\nเสาวณิต\nเสาวนะ\nเสาวนา\nเสาวนีย์\nเสาหฤท\nเสาะ\nเสาะแสะ\nเสิร์จ\nเสิร์ฟ\nเสีย\nเสียง\nเสี่ยง\nเสียด\nเสี้ยน\nเสียบ\nเสียม\nเสี่ยม\nเสี้ยม\nเสียว\nเสี่ยว\nเสี้ยว\nเสือ\nเสื่อ\nเสื้อ\nเสือก\nเสื่อม\nแส\nแส่\nแส้\nแสก\nแสง\nแสด\nแสดง\nแสตมป์\nแสน\nแสนย์\nแสนยากร\nแสนยานุภาพ\nแสบ\nแสม\nแสยก\nแสยง\nแสยะ\nแสรก\nแสร้ง\nแสลง\nแสล้ม\nแสวง\nแสะ\nโสก\nโสกโดก\nโสกันต์\nโสโครก\nโสณฑ์\nโสณิ\nโสณี\nโสด\nโสดก\nโสดม\nโสดา\nโสดาบัน\nโสดาปัตติผล\nโสดาปัตติมรรค\nโสต\nโสตทัศนวัสดุ\nโสตทัศนอุปกรณ์\nโสตทัศนูปกรณ์\nโสตินทรีย์\nโสตถิ\nโสทก\nโสทร\nโสธก\nโสธนะ\nโสน\nโสภณ\nโสภา\nโสภี\nโสภิณี\nโสเภณี\nโสม\nโสมนัส\nโสมม\nโสมย์\nโสร่ง\nโสรจ\nโสรวาร\nโสโร\nโสวรรณ\nโสหุ้ย\nโสฬส\nใส\nใส่\nไส\nไส้\nไสย\nไสยา\nไสยาสน์\nไสร้\nไสว\nหก\nหกคะเมน\nหง\nหงก\nหงส์\nหงสบาท\nหงสรถ\nหงอ\nหงอก\nหง่อง\nหงองแหงง\nหงอด\nหงอน\nหง่อม\nหงอย\nหง่อย\nหงัก\nหงับ\nหง่าง\nหงาย\nหง่าว\nหงำ\nหงิก\nหงิง\nหง
ิม\nหงึก\nหงุงหงิง\nหงุดหงิด\nหงุบ\nหงุ่ย\nหญ้า\nหญ้าฝรั่น\nหญ้ายายเภา\nหญิง\nหญิบ\nหด\nหตะ\nหทัย\nหน\nหนวก\nหน่วง\nหนวด\nหน่วย\nหน่วยกิต\nหนอ\nหน่อ\nหนอก\nหนอง\nหนอน\nหนอนตายหยาก\nหน่อย\nหน็อยแน่\nหนัก\nหนัง\nหนังสติ๊ก\nหนังสือ\nหนั่น\nหนับ\nหนา\nหน้า\nหน่าง\nหนาด\nหนาน\nหนาม\nหน่าย\nหนาว\nหนำ\nหนำเลี้ยบ\nหนี\nหนี้\nหนีบ\nหนึก\nหนึ่ง\nหนึบ\nหนืด\nหนุ\nหนุน\nหนุบ\nหนุ่ม\nหนุ่ย\nหนู\nห่ม\nหมก\nหมด\nหม่น\nหมวก\nหมวด\nหมวน\nหมอ\nหม่อ\nหม้อ\nหมอก\nหมอง\nหม่อง\nหมอน\nหม่อน\nหมอบ\nหม่อม\nหมอย\nหม้อห้อม\nหมัก\nหมักหมม\nหมัด\nหมัน\nหมั่น\nหมั้น\nหมับ\nหมา\nหม่า\nหมาก\nหมากฮอส\nหมาง\nหมาด\nหมามุ่ย\nหมามุ้ย\nหมาย\nหม้าย\nหมาร่า\nหม่ำ\nหม้ำ\nหมิ่น\nหมี\nหมี่\nหมึก\nหมืน\nหมื่น\nหมุด\nหมุน\nหมุบ\nหมุบหมับ\nหมุบหมิบ\nหมุ่ย\nหมุยขาว\nหมู\nหมู่\nหมูหริ่ง\nหยก\nหย่ง\nหยด\nหยวก\nหยวบ\nหยอก\nหยอกเอิน\nหย็อกหย็อย\nหยอง\nหย็อง\nหย่อง\nหย็องกรอด\nหย็องแหย็ง\nหยอด\nหยอน\nหย่อน\nหย่อม\nหย็อมแหย็ม\nหย็อย\nหย่อย\nหยัก\nหยักไย่\nหยักเหยา\nหยัง\nหยั่ง\nหยังหยัง\nหยัด\nหยัน\nหยับ\nหยั่วเมือง\nหย่า\nหยากเยื่อ\nหยากไย่\nหยาด\nหยาบ\nหยาม\nหยาว\nหย้าว\nหยำเป\nหยำเหยอะ\nหยำแหยะ\nหยิก\nหยิ่ง\nหยิบ\nหยิม\nหยี\nหยี่\nหยุกหยิก\nหยุด\nหยุ่น\nหยุบ\nหยุมหยิม\nหยูกยา\nหโยดม\nหรคุณ\nหรณะ\nหรดาล\nหรดี\nหรรษ์\nหรรษา\nหรอ\nหรอก\nหร็อมแหร็ม\nหรอย\nหระ\nหรับ\nหรา\nหริ\nหริ่ง\nหริณะ\nหริต\nหริตกี\nหรีตกี\nหรี่\nหรีด\nหรือ\nหรุบ\nหรุบรู่\nหรุบหรู่\nหรุ่ม\nหรู\nหรูหรา\nหฤทัย\nหฤทย์\nหฤษฎ์\nหฤษฎี\nหฤหรรษ์\nหฤโหด\nหลง\nหลงใหล\nหลงจู๊\nหลด\nหลน\nหล่น\nหลบ\nหล่ม\nหลวง\nหลวม\nหลอ\nหล่อ\nหลอก\nหลอด\nหลอน\nหล็อน\nหล่อน\nหลอม\nหละ\nหละหลวม\nหลัก\nหลัง\nหลั่ง\nหลัด\nหลั่น\nหลับ\nหลัว\nหลา\nหล้า\nหลาก\nหลาน\nหลาบ\nหลาม\nหลาย\nหลาว\nหลิ่ง\nหลิท\nหลิน\nหลิม\nหลิว\nหลิ่ว\nหลี\nหลีก\nหลีโก\nหลีบ\nหลีฮื้อ\nหลืบ\nหลุกหลิก\nหลุด\nหลุน\nหลุบ\nหลุม\nหลุมพอ\nหลุมพี\nหลู่\nหวง\nห่วง\nห้วง\nหวด\nหวน\nห้วน\nหวย\nห้วย\nหวอ\nหวอด\nหวะ\nหวัง\nหวัด\nหวั่น\nหวันยิหวา\nหวัว\nหวัวร่อ\nหวัวเราะ\nหวา\nหว่า\nหว้า\nหวาก\nหว่าง\nหวาด\nหวาน\nหว่าน\nหวาม\nหวาย\nหวำ\nหวิด\nหวิว\nหวี\nหวี่\nหวีด\nหวือ\nหวุดหวิด\nหวูด\nหอ\nห่อ\nห้อ\nหอก\nหอง\nห้อง\nหอน\n
ห่อน\nหอบ\nหอม\nห้อม\nหอย\nห้อย\nหะ\nหะยี\nหะหาย\nหัก\nหัจญ์\nหัจญี\nหัช\nหัฏฐะ\nหัด\nหัต\nหัตถ์\nหัตถกรรม\nหัตถการ\nหัตถกิจ\nหัตถบาส\nหัตถพันธ์\nหัตถาภรณ์\nหัตถศาสตร์\nหัตถศิลป์\nหัตถศึกษา\nหัตถาจารย์\nหัตถินี\nหัตถี\nหัน\nหั่น\nหั้น\nหันตรา\nหับ\nหัย\nหัว\nหัวร่อ\nหัวเราะ\nหัส\nหัสดิน\nหัสดี\nหัสต์\nหัสตะ\nหา\nห่า\nห้า\nหาก\nหาง\nห่าง\nห้าง\nหาญ\nหาด\nห่าน\nหาบ\nหาม\nห่าม\nห้าม\nหาย\nหายใจ\nหายนะ\nหาร\nหารือ\nหาว\nห้าว\nหาสะ\nหำ\nห้ำ\nหิ้ง\nหิงคุ\nหิงสา\nหิงห้อย\nหิ่งห้อย\nหิ่งหาย\nหิด\nหิต\nหิตานุหิตประโยชน์\nหิน\nหิมพาน\nหิมพานต์\nหิมวัต\nหิมวันต์\nหิมวา\nหิมะ\nหิมาลัย\nหิรัญ\nหิรัญญิการ์\nหิรัญบัฏ\nหิรัณย์\nหิรัณยรัศมี\nหิริ\nหิว\nหิ้ว\nหี\nหีด\nหีนยาน\nหีบ\nหึ\nหึง\nหึ่ง\nหึงสา\nหืด\nหืน\nหื่น\nหือ\nหื้อ\nหุง\nหุน\nหุ่น\nหุ้น\nหุนหัน\nหุบ\nหุ้ม\nหุยฮา\nหู\nหู่\nหูก\nหูด\nเห\nเห่\nเหง\nเหง่ง\nเหงา\nเหง้า\nเหงื่อ\nเหงือก\nเห็จ\nเห็ด\nเหติ\nเหตุ\nเห็น\nเหน่ง\nเหนงนายพราน\nเหน็ดเหนื่อย\nเหน็บ\nเหน่อ\nเห็นอ้ม\nเหนอะ\nเหนอะหนะ\nเหน้า\nเหนาะ\nเหนียง\nเหนี่ยง\nเหนี่ยน\nเหนียม\nเหนียว\nเหนี่ยว\nเหนือ\nเหนื่อย\nเห็บ\nเหม\nเหม่\nเหม็ง\nเหม่ง\nเหม็น\nเหมวดี\nเหม่อ\nเหมันต์\nเหมันตฤดู\nเหมา\nเหมายัน\nเหมาะ\nเหมียว\nเหมี่ยว\nเหมือง\nเหมือด\nเหมือน\nเหมื่อย\nเหย\nเหยง\nเหย่อย\nเหยา\nเหย่า\nเหย้า\nเหยาะ\nเหยาะแหยะ\nเหยิง\nเหยิบ\nเหยียด\nเหยียบ\nเหยี่ยว\nเหยื่อ\nเหยือก\nเหรอ\nเหรอะ\nเหรัญญิก\nเหรา\nเหราะ\nเหรียญ\nเหล่\nเหล็ก\nเหลน\nเหลว\nเหลอ\nเหลา\nเหล่า\nเหล้า\nเหลาะแหละ\nเหลิง\nเหลิงเจิ้ง\nเหลียน\nเหลี่ยม\nเหลียว\nเหลือ\nเหลือก\nเหลือง\nเหลือบ\nเหลือม\nเหลื่อม\nเหว\nเหว่\nเหวง\nเหวย\nเหวอะ\nเหวอะหวะ\nเหวี่ยง\nเห่อ\nเหอะ\nเหะ\nเหะหะ\nเหา\nเห่า\nเหาะ\nเหิน\nเหิม\nเหี้ย\nเหียง\nเหียน\nเหี้ยน\nเหี้ยม\nเหี่ยว\nเหื่อ\nเหือด\nแห\nแห่\nแห้\nแหก\nแหง\nแหง่\nแห่ง\nแห้ง\nแหงแก๋\nแหง่ง\nแหงน\nแหน\nแห้น\nแหนง\nแหนบ\nแหนม\nแหบ\nแหม\nแหม่\nแหม่ม\nแหมะ\nแหย\nแหย่\nแหยง\nแหย่ง\nแหยม\nแหย็ม\nแหยะ\nแหล่\nแหลก\nแหล่ง\nแหลน\nแหลม\nแหละ\nแหว\nแห้ว\nแหวก\nแหว่ง\nแหวด\nแหวน\nแหวะ\nแหะ\nโห่\nโหง\nโหด\nโหน\nโหนก\nโหน่ง\nโหม\nโหม่ง\nโหมด\nโหย\nโหยกเหยก\nโหยง\nโหย่ง\nโหร\nโหรง\nโหรงเหรง\nโหรดาจารย์\nโ
หระพา\nโหรา\nโหราจารย์\nโหราศาสตร์\nโหล\nโหล่\nโหลงโจ้ง\nโหว\nโหว่\nโหว้\nโหวกเหวก\nโหวง\nโหวด\nโหวต\nให้\nใหญ่\nใหม่\nไห\nไห่\nไห้\nไหน\nไหม\nไหม้\nไหรณย์\nไหล\nไหล่\nไหว\nไหว้\nไหหลำ\nอก\nอกตเวทิตา\nอกตเวที\nอกตัญญุตา\nอกตัญญู\nอกนิษฐ์\nอกรณีย์\nอกรรมกริยา\nอกัปปิยวัตถุ\nอกัปปิยะ\nอกุศล\nอคติ\nอคาธ\nอโฆษะ\nองก์\nองค์\nองคชาต\nองคมนตรี\nองครักษ์\nองคาพยพ\nองคุลี\nองศ์\nองศา\nองอาจ\nองุ่น\nอจลา\nอจินตา\nอจินไตย\nอจิระ\nอเจลก\nอเจละ\nอชะ\nอชิน\nอชินี\nอชิระ\nอฏวี\nอณิ\nอณู\nอโณทัย\nอด\nอดิถี\nอดิเทพ\nอดิเรก\nอดิศร\nอดิศวร\nอดิศัย\nอดีต\nอดุล\nอดุลย์\nอติ\nอติชาต\nอติมานะ\nอติราช\nอติเรก\nอติสาร\nอถรรพเวท\nอาถรรพเวท\nอทระ\nอทินนาทาน\nอธรรม\nอธิ\nอธิกมาส\nอธิกรณ์\nอธิกวาร\nอธิกสุรทิน\nอธิการ\nอธิคม\nอธิฏฐาน\nอธิบดี\nอธิบาย\nอธิป\nอธิปไตย\nอธิมาตร\nอธิมุตติ\nอธิโมกข์\nอธิราช\nอธิวาส\nอธิวาสนะ\nอธิศีล\nอธิษฐาน\nอธึก\nอ้น\nอนงค์\nอนงคณะ\nอนงคเลขา\nอนธการ\nอนนต์\nอนยะ\nอนรรฆ\nอนรรถ\nอนล\nอนวัช\nอนัญ\nอนัตตา\nอนันต์\nอนันตริยกรรม\nอนัม\nอนาคต\nอนาคามิผล\nอนาคามิมรรค\nอนาคามี\nอนาจาร\nอนาถ\nอนาถา\nอนาทร\nอนาธิปไตย\nอนามัย\nอนามิกา\nอนารยชน\nอนารยธรรม\nอนารยะ\nอนาลัย\nอนำ\nอนิจ\nอนิจจัง\nอนิจจา\nอนิฏฐารมณ์\nอนิยต\nอนิยม\nอนิล\nอนิวรรต\nอนิวรรตน์\nอนีกะ\nอนีจะ\nอนึก\nอนึ่ง\nอนุ\nอนุกร\nอนุกรม\nอนุกรรมการ\nอนุกระเบียด\nอนุกาชาด\nอนุการ\nอนุกูล\nอนุคามิก\nอนุเคราะห์\nอนุจร\nอนุช\nอนุชน\nอนุชา\nอนุชาต\nอนุชิต\nอนุญาต\nอนุญาโตตุลาการ\nอนุตร\nอนุเถระ\nอนุทิน\nอนุบท\nอนุบาล\nอนุประโยค\nอนุปริญญา\nอนุปสัมบัน\nอนุปัสนา\nอนุพงศ์\nอนุพัทธ์\nอนุพันธ์\nอนุโพธ\nอนุภรรยา\nอนุภริยา\nอนุภาค\nอนุภาษ\nอนุมัติ\nอนุมาตรา\nอนุมาน\nอนุมูล\nอนุโมทนา\nอนุโยค\nอนุรักษ์\nอนุรักษนิยม\nอนุราช\nอนุราธ\nอนุราธะ\nอนุราธา\nอนุรูป\nอนุโลม\nอนุวงศ์\nอนุวรรตน์\nอนุวัต\nอนุวัตน์\nอนุวัตร\nอนุวัติ\nอนุวาต\nอนุศาสก\nอนุศาสน์\nอนุศาสนาจารย์\nอนุศิษฏ์\nอนุสติ\nอนุสนธิ\nอนุสร\nอนุสรณ์\nอนุสัญญา\nอนุสัย\nอนุสาวรีย์\nอนุสาสนี\nอเนก\nอเนกคุณ\nอเนกรรถประโยค\nอเนจอนาถ\nอโนชา\nอโนดาต\nอบ\nอบเชย\nอบาย\nอปจายนธรรม\nอปจายนมัย\nอปมงคล\nอปยศ\nอประไมย\nอปรัณณชาติ\nอปรา\nอปราชัย\nอปราชิต\nอปริมาณ\nอปลักษณ์\nอปโลกน์\nอปวาท\nอเปหิ\
nอพพะ\nอพยพ\nอภัพ\nอภัย\nอภิ\nอภิฆาต\nอภิชฌา\nอภิชน\nอภิชนาธิปไตย\nอภิชัย\nอภิชาต\nอภิชิต\nอภิญญา\nอภิญญาณ\nอภิธรรม\nอภิธาน\nอภิไธย\nอภินันท์\nอภินันทนาการ\nอภินัย\nอภินิหาร\nอภิเนษกรมณ์\nอภิบาล\nอภิปรัชญา\nอภิปราย\nอภิมหาอำนาจ\nอภิมานะ\nอภิมุข\nอภิรดี\nอภิรติ\nอภิรมย์\nอภิรักษ์\nอภิราม\nอภิรุต\nอภิรุม\nอภิรูป\nอภิลักขิต\nอภิลักขิตสมัย\nอภิเลปน์\nอภิวันท์\nอภิวาท\nอภิวาทน์\nอภิเษก\nอภิสมโพธิ\nอภิสมัย\nอภิสมาจาร\nอภิสัมโพธิ\nอภิสัมโพธิญาณ\nอภิสิต\nอภิสิทธิ์\nอภูตะ\nอม\nอมพะนำ\nอ้ม\nอมตธรรม\nอมตบท\nอมตะ\nอมนุษย์\nอมร\nอมรา\nอมราวดี\nอมรินทร์\nอมเรนทร์\nอมเรศ\nอมฤต\nอมัตร\nอมาตย์\nอมาวสี\nอมาวสุ\nอมาวาสี\nอมิตร\nอเมริกัน\nอย่า\nอยาก\nอย่าง\nอยุทธ์\nอยู่\nอร\nอรชร\nอรชุน\nอรดี\nอรติ\nอรทัย\nอรไท\nอรนุช\nอรพินท์\nอรพิม\nอรรค\nอรรฆ\nอรรฆย์\nอรรจน์\nอรรณพ\nอรรถ\nอรรถกถา\nอรรถกถาจารย์\nอรรถาธิบาย\nอรรธ\nอรสุม\nอรหะ\nอรหัง\nอรหัต\nอรหัตผล\nอรหัตมรรค\nอรหัน\nอรหันต์\nอรหันตฆาต\nอร่อย\nอรัญ\nอรัญญิก\nอรัญวาส\nอรัญวาสี\nอรัณย์\nอราดี\nอร่าม\nอริ\nอรินทร์\nอริน\nอริยกะ\nอริยทรัพย์\nอริยบุคคล\nอริยผล\nอริยมรรค\nอริยสัจ\nอริยะ\nอรุณ\nอรุโณทัย\nอรุ่ม\nอรูป\nอลงกต\nอลงกรณ์\nอลงการ\nอลวน\nอลเวง\nอลหม่าน\nอล่องฉ่อง\nอลักเอลื่อ\nอลังการ\nอลัชชี\nอล่างฉ่าง\nอลิงค์\nอลึงค์\nอลึ่งฉึ่ง\nอโลหะ\nอ้วก\nอวกาศ\nอวจร\nอวชัย\nอวชาต\nอวด\nอวตาร\nอวน\nอ้วน\nอวบ\nอวมงคล\nอวย\nอวยวะ\nอวรรค\nอวรุทธ์\nอวรุทธก\nอวล\nอวสาน\nอวหาร\nอวัยวะ\nอวัสดา\nอวาจี\nอวิจี\nอวิชชา\nอวิญญาณกทรัพย์\nอวิญญู\nอวิรุทธ์\nอวิโรธน์\nอวิโรธนะ\nอวิหิงสา\nอวีจิ\nอเวจี\nอโศก\nอสงไขย\nอสนี\nอัสนี\nอสนีบาต\nอสภะ\nอสมการ\nอสมมาตร\nอสรพิษ\nอสังหาริมทรัพย์\nอสังหาริมะ\nอสัญกรรม\nอสัญญี\nอสัญแดหวา\nอสัตถพฤกษ์\nอัสสัตถพฤกษ์\nอสัตย์\nอสัมภิน\nอสัมภินพงศ์\nอสัมภินวงศ์\nอสิ\nอสิธารา\nอสิต\nอสิเลสะ\nอสีตยานุพยัญชนะ\nอสีติ\nอสุ\nอสุจิ\nอสุนีบาต\nอสุภ\nอสุรกาย\nอสุรา\nอสุรี\nอสุเรศ\nอสูร\nอเสกขบุคคล\nอเสกขะ\nอเสขบุคคล\nอเสขะ\nอหังการ\nอหิ\nอหิงสา\nอหิวาต์\nอหิวาตกโรค\nอหึงสา\nอเหตุกทิฐิ\nอโหสิ\nออ\nอ้อ\nอ๋อ\nออก\nออกซิเจน\nออกซิเดชัน\nออกไซด์\nออกญา\nอ่อง\nอ๋อง\nอ้องแอ้ง\nออเซาะ\nออด\nอ๊อด\nอ๊อดแอ๊ด\nอ่อน\nอ้อน\nออนซ์\nออนซอน\nอ้อนแอ้น\nออฟฟิ
ศ\nออม\nอ่อม\nอ้อม\nออมชอม\nออมซอม\nอ้อมแอ้ม\nอ่อย\nอ้อย\nอ๋อย\nอ้อยส้อย\nอ้อยอิ่ง\nออสเมียม\nอ้อแอ้\nอ๊ะ\nอะคร้าว\nอะเคื้อ\nอะแจ\nอะเซทิลีน\nอะดรีนาลิน\nอะดุง\nอะตอม\nอะมีบา\nอะเมริเซียม\nอะร้าอร่าม\nอะไร\nอะลุ่มอล่วย\nอะลุ้มอล่วย\nอะลูมิเนียม\nอะหม\nอะไหล่\nอัก\nอั้ก\nอั๊ก\nอักกะ\nอักโกธะ\nอักขรวิธี\nอักขรวิบัติ\nอักขรสมัย\nอักขระ\nอักขรานุกรม\nอักขะ\nอักโข\nอักโขภิณี\nอักโขเภณี\nอักษร\nอักษะ\nอักเษาหิณี\nอักเสบ\nอักอ่วน\nอัคคะ\nอัคคิ\nอัคคี\nอัคนิ\nอัคนี\nอัคร\nอัครชายา\nอัครมเหสี\nอัครราชทูต\nอัครสมณทูต\nอัง\nอังก์\nอังกนะ\nอังกฤษ\nอังกะลุง\nอังกา\nอังกาบ\nอังกุระ\nอังกุศ\nอังกูร\nอังคณะ\nอังคณา\nอังคาร\nอังคาส\nอังคีรส\nอังคุฐ\nอังคุตรนิกาย\nอังฆาต\nอังแพลม\nอั้งยี่\nอั้งโล่\nอังศุ\nอังศุก\nอังศุธร\nอังศุมาลี\nอังสกุฏ\nอังสตรอม\nอังสนา\nอังสภาระ\nอังสะ\nอังสา\nอัจกลับ\nอัจจิ\nอัจจิมา\nอัจจุตะ\nอัจฉรา\nอัจฉริยบุคคล\nอัจฉริยภาพ\nอัจฉริยลักษณ์\nอัจฉริยลักษณะ\nอัจฉริยะ\nอัจนา\nอัชฌัตติก\nอัชฌา\nอัชฌาจาร\nอัชฌาศัย\nอัชฌาสัย\nอัญเดียรถีย์\nอัญมณี\nอัญขยม\nอัญชนะ\nอัญชลี\nอัญชัน\nอัญชุลี\nอัญเชิญ\nอัญญะ\nอัญดิตถีย์\nอัญเดียรถีย์\nอัญประกาศ\nอัญรูป\nอัฏ\nอัฏฐบาน\nอัฏฐะ\nอัฏฐังคิกมรรค\nอัฏนา\nอัฐ\nอัฐฬส\nอัฐเคราะห์\nอัฐทิศ\nอัฐบริขาร\nอัฐบาน\nอัฐม\nอัฐมี\nอัฐศก\nอัฐิ\nอัฒจันทร์\nอัฒภาค\nอัฒมาส\nอัฒรัตติ\nอัณฑโกส\nอัณฑชะ\nอัณฑะ\nอัณณพ\nอัด\nอัดแจ\nอัต\nอัตชีวประวัติ\nอัตนัย\nอัตภาพ\nอัตวินิบาตกรรม\nอัตคัด\nอัตตโนบท\nอัตตา\nอัตตาธิปไตย\nอัตถ์\nอัตถะ\nอัตโนมัติ\nอัตรชะ\nอัตรา\nอัตลัด\nอัททา\nอัทธ์\nอัทธา\nอัทธาน\nอัทธายุ\nอัธยาตมวิทยา\nอัธยาย\nอัธยาศัย\nอัน\nอั้น\nอั๋น\nอันดร\nอันดับ\nอันตกะ\nอันตกาล\nอันตะ\nอันตคุณ\nอันตรภาค\nอันตรวาสก\nอันตรธาน\nอันตรา\nอันตราย\nอันตรายิกธรรม\nอันติกะ\nอันติมสัจ\nอันติมะ\nอันเต\nอันโต\nอันเตปุริก\nอันเตวาสิก\nอันแถ้ง\nอันโทล\nอันธการ\nอันธพาล\nอันธิกา\nอันเวส\nอับ\nอับปาง\nอัปปะ\nอัปเปหิ\nอัปภาคย์\nอัปมงคล\nอัปยศ\nอัประมาณ\nอัประไมย\nอัปราชัย\nอัปรีย์\nอัปลักษณ์\nอัปสร\nอัพพุท\nอัพโพหาริก\nอัพภันดร\nอัพภาน\nอัพภาส\nอัพภูตธรรม\nอัพยากฤต\nอัมพฤกษ์\nอัมพาต\nอัมพวัน\nอัมพวา\nอัมพร\nอัมพา\nอัมพิละ\nอัมพุ\nอัมพุช\nอัมพุชินี\nอัมพุท\nอัยกา\nอัย
การ\nอัยกี\nอัยยะ\nอัยยิกา\nอัลกุรอาน\nอัลตราไวโอเลต\nอั่ว\nอัศจรรย์\nอัศเจรีย์\nอัศว\nอัศวเมธ\nอัศวยุช\nอัศวานึก\nอัศวิน\nอัศวินี\nอัษฎมงคล\nอัษฏมงคล\nอัษฎางคิกมรรค\nอัษฎายุธ\nอัษฎาวุธ\nอัสสะ\nอัสดร\nอัสกัณ\nอัสดง\nอัสดงคต\nอัสมิมานะ\nอัสสนี\nอัสสานึก\nอัสสาสะ\nอัสสุ\nอา\nอ่า\nอ้า\nอ๋า\nอากร\nอากังขา\nอากัป\nอาการ\nอากาศ\nอากูล\nอาเกียรณ์\nอาขยาต\nอาขยาน\nอาคเนย์\nอาคม\nอาครหายณี\nอาคันตุกะ\nอาคันตุกภัต\nอาคันตุกวัตร\nอาคาร\nอาฆาต\nอ่าง\nอ้าง\nอางขนาง\nอ้างว้าง\nอาจ\nอาจม\nอาจริยวัตร\nอาจริยวาท\nอาจาด\nอาจาร\nอาจารย์\nอาจารี\nอาจิณ\nอาเจียน\nอาชญา\nอาชวะ\nอาชา\nอาชาไนย\nอาชีพ\nอาชีวศึกษา\nอาชีวะ\nอาชีวก\nอาญา\nอาฏานา\nอาณัติ\nอาณา\nอาด\nอาดูร\nอาดุลย์\nอาดูลย์\nอาเด๊ะ\nอาตมภาพ\nอาตมัน\nอาตมา\nอาถรรพ์\nอาถรรพณ์\nอาทร\nอาทิ\nอาทิจจวาร\nอาทิตย์\nอาทิตยมณฑล\nอาทิตยวาร\nอาทีนพ\nอาทีนวะ\nอาทึก\nอาเทศ\nอาเทสนา\nอาธรรม\nอาธรรม์\nอาธาน\nอาธาร\nอาน\nอ่าน\nอานน\nอานนท์\nอานันท์\nอานันทนะ\nอานัม\nอานาปานะ\nอานาปานัสสติ\nอานิสงส์\nอานุภาพ\nอานุภาวะ\nอาบ\nอาบัติ\nอาบัน\nอาปณกะ\nอาปณะ\nอาปะ\nอาโป\nอาปานะ\nอาพัทธ์\nอาพันธ์\nอาพันธนะ\nอาพาธ\nอาเพศ\nอาภรณ์\nอาภัพ\nอาภัสระ\nอาภา\nอาภาส\nอามลกะ\nอามัย\nอามิษ\nอามิส\nอาย\nอ้าย\nอายตนะ\nอายตะ\nอายน\nอายัด\nอายัต\nอายัน\nอายาจนะ\nอายานะ\nอายุ\nอายุตกะ\nอายุธ\nอายุรกรรม\nอายุรแพทย์\nอายุรเวช\nอายุรเวท\nอายุศาสตร์\nอายุษ\nอาร์กอน\nอารดี\nอารติ\nอาร์ต\nอารทรา\nอาร์ม\nอารมณ์\nอารยชน\nอารยชาติ\nอารยธรรม\nอารยประเทศ\nอารยะ\nอารยัน\nอาระ\nอารักขา\nอารักษ์\nอารัญ\nอารัณย์\nอารัญญิก\nอารัณยกะ\nอารัติ\nอารัมภ์\nอารัมภกถา\nอารัมภบท\nอารัมภะ\nอาราธน์\nอาราธนา\nอาราม\nอารามิก\nอารี\nอารุม\nอาลปนะ\nอาละวาด\nอาลักษณ์\nอาลัย\nอาลัว\nอาลี\nอาโลก\nอาว\nอ่าว\nอ้าว\nอาวรณ์\nอาวัชนาการ\nอาวัล\nอาวาส\nอาวาสิก\nอาวาหมงคล\nอาวาหะ\nอาวุต\nอาวุธ\nอาวุโส\nอาเวค\nอาศรม\nอาศรมบท\nอาศเลษา\nอาศัย\nอาศิรพจน์\nอาศิรพาท\nอาศิรวาท\nอาศุ\nอาเศียรพจน์\nอาเศียรพาท\nอาเศียรวาท\nอาษาฒ\nอาสน\nอาสน์\nอาสนะ\nอาสนศาลา\nอาสัญ\nอาสัตย์\nอาสา\nอาสาฬห์\nอาสาฬหบูชา\nอาสาฬหะ\nอาสิญจ์\nอาสิน\nอาหม\nอาหรับ\nอาหาร\nอาฬหก\nอำ\nอ่ำ\nอ้ำ\nอำแดง\nอำนนต์\nอำนรรฆ\nอำนวย\nอำนาจ\nอำนาถ\nอำนิฐ\nอำน
ิษฐ์\nอำปลัง\nอำพน\nอำพล\nอำพะนำ\nอำพัน\nอำไพ\nอำเภอ\nอำมร\nอำมฤคโชค\nอำมฤต\nอำมหิต\nอำมาตย์\nอำมาตยาธิปไตย\nอำยวน\nอำรุง\nอำลา\nอำอวม\nอ้ำอึ้ง\nอิก\nอิง\nอิงค์\nอิงอร\nอิจฉา\nอิฉัน\nอิชยา\nอิฏฐารมณ์\nอิฐ\nอิฐผล\nอิด\nอิตถี\nอิตเทรียม\nอิตเทอร์เบียม\nอิติวุตตกะ\nอิทธิ\nอิน\nอินซูลิน\nอินเดีย\nอินเดียนแดง\nอินเดียม\nอินท์\nอินทขีล\nอินทนิล\nอินทผลัม\nอินทร์\nอินทรธนู\nอินทรวงศ์\nอินทรวิเชียร\nอินทรศักดิ์\nอินทราณี\nอินทราภิเษก\nอินทรายุธ\nอินทรี\nอินทรีย์\nอินทรียสังวร\nอินทีวร\nอินทุ\nอินฟราเรด\nอินัง\nอิ่ม\nอิมัลชัน\nอิริเดียม\nอิริยา\nอิริยาบถ\nอิรุพเพท\nอิเล็กตรอน\nอิเล็กทรอนิกส์\nอิเล็กโทน\nอิศร\nอิศวร\nอิษฏ์\nอิษฏี\nอิส\nอิสตรี\nอิสัตรี\nอิสรภาพ\nอิสระ\nอิสริยยศ\nอิสริยะ\nอิสริยาภรณ์\nอิสลาม\nอิสสา\nอิสิ\nอิสี\nอิหม่าม\nอิหลักอิเหลื่อ\nอี\nอี่\nอี้\nอี๊\nอี๋\nอี๋อ๋อ\nอีก\nอีก้อ\nอีก๋อย\nอีโก้ง\nอีจู้\nอี๊ด\nอีเต้อ\nอีโต้\nอีทุบ\nอีเทอร์\nอีนุงตุงนัง\nอีนูน\nอีโน\nอีแปะ\nอีโปง\nอีเพา\nอีมู\nอีรม\nอีลุ้ม\nอีลุ่ยฉุยแฉก\nอีเลิ้ง\nอีศ\nอีศวร\nอีส\nอีสาน\nอีสุกอีใส\nอีหรอบ\nอีหลักอีเหลื่อ\nอีหลี\nอีหลุกขลุกขลัก\nอีหลุกขลุกขลุ่ย\nอีเห็น\nอีเหน็บ\nอีเหนียว\nอีเหละเขละขละ\nอีเหละเขะขะ\nอีโหน่อีเหน่\nอีโหลกโขลกเขลก\nอีแอ่น\nอึ\nอึก\nอึ้ก\nอึ๊ก\nอึกทึก\nอึกอัก\nอึง\nอึ่ง\nอึ้ง\nอึด\nอึดตะปือ\nอึ้ดทึ่ด\nอึน\nอืด\nอื่น\nอื้น\nอือ\nอื้อ\nอื้อฮือ\nอุ\nอุก\nอุกกา\nอุกกาบาต\nอุกฤษฏ์\nอุกลาบาต\nอุค\nอุคระ\nอุคหนิมิต\nอุโฆษ\nอุ้ง\nอุจ\nอุจจาระ\nอุจฉุ\nอุจเฉท\nอุจเฉททิฐิ\nอุจาด\nอุชุ\nอุฏฐาการ\nอุณหภูมิ\nอุณหะ\nอุณหาหาร\nอุณหิส\nอุณา\nอุณาโลม\nอุด\nอุดเตา\nอุดม\nอุดมการณ์\nอุดมคติ\nอุดมศึกษา\nอุดร\nอุดหนุน\nอุตดม\nอุตตมะ\nอุตมภาพ\nอุตมางค์\nอุตดร\nอุตรกุรุทวีป\nอุตตรายัน\nอุตรนิกาย\nอุตรผลคุนี\nอุตตรผลคุนี\nอุตรภัทรบท\nอุตตรภัทรบท\nอุตตรภัททะ\nอุตราภิมุข\nอุตราวรรต\nอุตราวัฏ\nอุตราษาฒ\nอุตตราสาฬหะ\nอุตราสงค์\nอุตตานภาพ\nอุตพิด\nอุตรา\nอุตริ\nอุตริมนุสธรรม\nอุตลุด\nอุตส่าห์\nอุตสาหกรรม\nอุตสาหะ\nอุตุ\nอุตุนิยม\nอุตุนิยมวิทยา\nอุทก\nอุทกธาร\nอุทกธารา\nอุทกภัย\nอุทกวิทยา\nอุทกศาสตร์\nอุทธรณ์\nอุทธัจ\nอุทยาน\nอุทร\nอุทริยะ\nอุทลุม\nอุทัช\nอุทัย\nอุทาน\nอุทาร\nอุทาหรณ์\nอุทิศ\nอุทุมพร\
nอุเทศ\nอุเทสิกเจดีย์\nอุธัจ\nอุ่น\nอุบ\nอุบล\nอุบะ\nอุบ๊ะ\nอุบัติ\nอุบาท\nอุบาทว์\nอุบาย\nอุบาสก\nอุบาสิกา\nอุเบกขา\nอุโบสถ\nอุปกรณ์\nอุปกรม\nอุปการ\nอุปการะ\nอุปการี\nอุปกิเลส\nอุปจาร\nอุปถัมภ์\nอุปถัมภก\nอุปทม\nอุปทูต\nอุปเทศ\nอุปเท่ห์\nอุปธิ\nอุปนัย\nอุปนิกขิต\nอุปนิษัท\nอุปนิสัย\nอุปบัติ\nอุปปาติกะ\nอุปพัทธ์\nอุปพันธ์\nอุปโภค\nอุปมา\nอุปมาน\nอุปไมย\nอุปยุวราช\nอุปรากร\nอุปราคา\nอุปราช\nอุปริ\nอุปริมปริยาย\nอุปโลกน์\nอุปเวท\nอุปสมบท\nอุปสมบัน\nอุปสัมบัน\nอุปสรรค\nอุปสัมปทา\nอุปฮาด\nอุปัชฌาย์\nอุปัชฌายวัตร\nอุปัชฌายะ\nอุปัฏฐาก\nอุปัฏฐานะ\nอุปัทวะ\nอุปัทวันตราย\nอุปาทาน\nอุปาหนา\nอุภัย\nอุ้ม\nอุมงค์\nอุโมงค์\nอุย\nอุ่ย\nอุ้ย\nอุ๊ย\nอุยยาน\nอุยยาม\nอุรณะ\nอุรพี\nอุระ\nอุรังอุตัง\nอุรัจฉัท\nอุรัจฉทะ\nอุรา\nอุรุ\nอุไร\nอุลกมณี\nอุลโลจ\nอุลามก\nอุลิด\nอุโลก\nอุแว้\nอุษณกร\nอุษณกาล\nอุษณรัศมี\nอุษณรุจี\nอุษณาการ\nอุษณีษ์\nอุษมะ\nอุษมัน\nอุษา\nอุษาโยค\nอุสภ\nอุสส่าห์\nอุสสาหะ\nอุสา\nอุสุ\nอุสุภ\nอุสุภราช\nอุสุม\nอุหรับ\nอุหลบ\nอุเหม่\nอุฬาร\nอู\nอู่\nอู้\nอูฐ\nอูด\nอูม\nอูย\nอูรุ\nอู๋อี๋\nเอ\nเอ้\nเอ๊\nเอก\nเอกเขนก\nเอกซเรย์\nเอกรรถประโยค\nเอกัคตา\nเอกังสพยากรณ์\nเอกังสวาที\nเอกา\nเอ้กา\nเอกาธิปไตย\nเอเคอร์\nเอง\nเอ็ง\nเอ๋ง\nเอ็ด\nเอ็ดตะโร\nเอดส์\nเอตทัคคะ\nเอ้เต\nเอทิล\nเอน\nเอ็น\nเอนไซม์\nเอ็นดู\nเอ็นอ่อน\nเอม\nเอ็มบริโอ\nเอย\nเอ่ย\nเอ๊ย\nเอ๋ย\nเอร็ดอร่อย\nเอราวัณ\nเอว\nเอ๊ว\nเอวัง\nเอฬกะ\nเอฬา\nเออ\nเอ่อ\nเออร์เบียม\nเอ้อระเหย\nเอ้อเร้อ\nเอ้อเฮอ\nเอ๊ะ\nเอะใจ\nเอะอะ\nเอะอะมะเทิ่ง\nเอา\nเอาทาร\nเอาทารย์\nเอารส\nเอาฬาร\nเอาะลาย\nเอิก\nเอิกเกริก\nเอิ้น\nเอิบ\nเอียง\nเอี้ยง\nเอียด\nเอี๊ยด\nเอียน\nเอี่ยน\nเอี่ยม\nเอี๊ยม\nเอี้ยมจุ๊น\nเอี้ยมเฟี้ยม\nเอี่ยว\nเอี้ยว\nเอื้อ\nเอื๊อก\nเอื้อง\nเอือด\nเอือน\nเอื้อน\nเอือม\nเอื้อม\nเอื่อย\nเอื้อย\nแอ\nแอ้\nแอ๋\nแอก\nแอกทิเนียม\nแอ่ง\nแอ้งแม้ง\nแอด\nแอ้ด\nแอ๊ด\nแอ่น\nแอนติเจน\nแอนติบอดี\nแอนติอิเล็กตรอน\nแอโนด\nแอบ\nแอม\nแอ้ม\nแอมแปร์\nแอมมิเตอร์\nแอมโมเนีย\nแอร่ม\nแอลกอฮอล์\nแอลฟา\nแอว\nแอ่ว\nแอ้วแซ่ว\nแอสทาทีน\nแอสไพริน\nแอสฟัลต์\nแอหนัง\nแออัด\nโอ\nโอ่\nโอ้\nโอ๋\nโอก\nโอ้ก\nโอ้กอ้าก\nโอ๊ก\nโอกาส\nโอฆชล\nโอฆสงสาร\nโอฆะ\nโอ่ง\nโองการ
\nโองโขดง\nโอชะ\nโอชา\nโอโซน\nโอฐ\nโอด\nโอ๊ต\nโอตตัปปะ\nโอทนะ\nโอน\nโอบ\nโอปปาติกะ\nโอภา\nโอภาส\nโอม\nโอย\nโอ๊ย\nโอรส\nโอละพ่อ\nโอลิมปิก\nโอวาท\nโอษฐ์\nโอษฐชะ\nโอษฐภัย\nโอสถ\nโอห์ม\nโอหัง\nโอฬาร\nโอฬาริก\nโอฬารึก\nโอ้เอ้\nโอเอซิส\nโอ้โฮ\nไอ\nไอ้\nไอโซโทป\nไอน์สไตเนียม\nไอยรา\nไอยเรศ\nไอราพต\nไอราวัณ\nไอราวัต\nไอศกรีม\nไอศวรรย์\nไอศุริยสมบัติ\nไอศูรย์\nไอออน\nไอโอดีน\nฮกเกี้ยน\nฮด\nฮวงซุ้ย\nฮวน\nฮ้วนหมู\nฮวบ\nฮ่อ\nฮ้อ\nฮอกกี้\nฮอด\nฮ่อม\nฮ่อยจ๊อ\nฮอร์โมน\nฮอลแลนด์\nฮอลันดา\nฮะ\nฮะเบส\nฮะเรีย\nฮัก\nฮังเล\nฮัจญ์\nฮัจญะฮ์\nฮัจญี\nฮั่น\nฮั้ว\nฮา\nฮ้า\nฮ่างหลวง\nฮาจญ์\nฮาม\nฮาเร็ม\nฮาห์เนียม\nฮิจเราะห์\nฮินดู\nฮิปโปโปเตมัส\nฮิสทีเรีย\nฮีเลียม\nฮึ\nฮึก\nฮึด\nฮึดฮัด\nฮึ่ม\nฮึย\nฮึ่ย\nฮืดฮาด\nฮือ\nฮื่อ\nฮื้อ\nฮื้อฉี่\nฮุด\nฮุบ\nฮุยเลฮุย\nฮู้\nฮูก\nฮูม\nเฮ\nเฮฮา\nเฮกตาร์\nเฮกโตกรัม\nเฮกโตเมตร\nเฮกโตลิตร\nเฮง\nเฮ็ด\nเฮโมโกลบิน\nเฮย\nเฮ่ย\nเฮ้ย\nเฮโรอีน\nเฮลิคอปเตอร์\nเฮโล\nเฮละโล\nเฮ้ว\nเฮอ\nเฮ่อ\nเฮ้อ\nเฮอริเคน\nเฮิรตซ์\nเฮี้ยน\nเฮี้ยบ\nเฮี้ยว\nเฮือก\nเฮือน\nแฮ\nแฮ่\nแฮก\nแฮ่กึ๊น\nแฮนด์บอล\nแฮฟเนียม\nแฮม\nแฮะ\nโฮ\nโฮก\nโฮ่ง\nโฮ้ง\nโฮเต็ล\nโฮลเมียม\nโฮะ\nไฮ้\nไฮโกรมิเตอร์\nไฮดรา\nไฮโดร\nไฮโดรคาร์บอน\nไฮโดรเจน\nไฮโดรมิเตอร์\nไฮไฟ\nไฮโล\nไฮฮี\n"
+              .split(/[\r\n]+/)
+              .filter(function (w) {
+                return w.length > 1;
+              })
+      this.addWords(words, false)
+    }
+    if(finalize){
+      this.finalizeDict();
+    }
+  },
+
+  dictSeek: function (l, r, ch, strOffset, pos) {
+    var ans = null;
+    while (l <= r) {
+      var m = Math.floor((l + r) / 2),
+        dict_item = this.dict[m],
+        len = dict_item.length;
+      if (len <= strOffset) {
+        l = m + 1;
+      } else {
+        var ch_ = dict_item[strOffset];
+        if (ch_ < ch) {
+          l = m + 1;
+        } else if (ch_ > ch) {
+          r = m - 1;
+        } else {
+          ans = m;
+          if (pos == LEFT) {
+            r = m - 1;
+          } else {
+            l = m + 1;
+          }
+        }
+      }
+    }
+    return ans;
+  },
+
+  isFinal: function (acceptor) {
+    return this.dict[acceptor.l].length == acceptor.strOffset;
+  },
+
+  createAcceptor: function () {
+    return {
+      l: 0,
+      r: this.dict.length - 1,
+      strOffset: 0,
+      isFinal: false,
+      dict: this,
+      transit: function (ch) {
+        return this.dict.transit(this, ch);
+      },
+      isError: false,
+      tag: "DICT",
+      w: 1,
+      type: "DICT"
+    };
+  },
+
+  transit: function (acceptor, ch) {
+    var l = this.dictSeek(acceptor.l,
+      acceptor.r,
+      ch,
+      acceptor.strOffset,
+      LEFT);
+    if (l !== null) {
+      var r = this.dictSeek(l,
+        acceptor.r,
+        ch,
+        acceptor.strOffset,
+        RIGHT);
+      acceptor.l = l;
+      acceptor.r = r;
+      acceptor.strOffset++;
+      acceptor.isFinal = this.isFinal(acceptor);
+    } else {
+      acceptor.isError = true;
+    }
+    return acceptor;
+  },
+
+  sortuniq: function(a){
+    return a.sort().filter(function(item, pos, arr){
+      return !pos || item != arr[pos - 1];
+    })
+  },
+
+  flatten: function(a){
+    //[[1,2],[3]] -> [1,2,3]
+    return [].concat.apply([], a);
+  }
+};
+module.exports = WordcutDict;
+
+}).call(this,"/dist/tmp")
+},{"glob":16,"path":22}],3:[function(require,module,exports){
+var WordRule = {
+  createAcceptor: function(tag) {
+    if (tag["WORD_RULE"])
+      return null;
+
+    return {strOffset: 0,
+            isFinal: false,
+            transit: function(ch) {
+              var lch = ch.toLowerCase();
+              if (lch >= "a" && lch <= "z") {
+                this.isFinal = true;
+                this.strOffset++;
+              } else {
+                this.isError = true;
+              }
+              return this;
+            },
+            isError: false,
+            tag: "WORD_RULE",
+            type: "WORD_RULE",
+            w: 1};
+  }
+};
+
+var NumberRule = {
+  createAcceptor: function(tag) {
+    if (tag["NUMBER_RULE"])
+      return null;
+
+    return {strOffset: 0,
+            isFinal: false,
+            transit: function(ch) {
+              if (ch >= "0" && ch <= "9") {
+                this.isFinal = true;
+                this.strOffset++;
+              } else {
+                this.isError = true;
+              }
+              return this;
+            },
+            isError: false,
+            tag: "NUMBER_RULE",
+            type: "NUMBER_RULE",
+            w: 1};
+  }
+};
+
+var SpaceRule = {
+  tag: "SPACE_RULE",
+  createAcceptor: function(tag) {
+
+    if (tag["SPACE_RULE"])
+      return null;
+
+    return {strOffset: 0,
+            isFinal: false,
+            transit: function(ch) {
+              if (ch == " " || ch == "\t" || ch == "\r" || ch == "\n" ||
+                  ch == "\u00A0" || ch=="\u2003"//nbsp and emsp
+                 ) {
+                this.isFinal = true;
+                this.strOffset++;
+              } else {
+                this.isError = true;
+              }
+              return this;
+            },
+            isError: false,
+            tag: SpaceRule.tag,
+            w: 1,
+            type: "SPACE_RULE"};
+  }
+}
+
+var SingleSymbolRule = {
+  tag: "SINSYM",
+  createAcceptor: function(tag) {
+    return {strOffset: 0,
+            isFinal: false,
+            transit: function(ch) {
+              if (this.strOffset == 0 && ch.match(/^[\@\(\)\/\,\-\."`]$/)) {
+                this.isFinal = true;
+                this.strOffset++;
+              } else {
+                this.isError = true;
+              }
+              return this;
+            },
+            isError: false,
+            tag: "SINSYM",
+            w: 1,
+            type: "SINSYM"};
+  }
+}
+
+
+var LatinRules = [WordRule, SpaceRule, SingleSymbolRule, NumberRule];
+
+module.exports = LatinRules;
+
+},{}],4:[function(require,module,exports){
+var _ = require("underscore")
+  , WordcutCore = require("./wordcut_core");
+var PathInfoBuilder = {
+
+  /*
+    buildByPartAcceptors: function(path, acceptors, i) {
+    var 
+    var genInfos = partAcceptors.reduce(function(genInfos, acceptor) {
+      
+    }, []);
+    
+    return genInfos;
+  } 
+  */
+
+  buildByAcceptors: function(path, finalAcceptors, i) {
+    var self = this;
+    var infos = finalAcceptors.map(function(acceptor) {
+      var p = i - acceptor.strOffset + 1
+        , _info = path[p];            
+      
+      var info = {p: p, 
+                  mw: _info.mw + (acceptor.mw === undefined ? 0 : acceptor.mw),
+                  w: acceptor.w + _info.w,
+                  unk: (acceptor.unk ? acceptor.unk : 0) + _info.unk, 
+                  type: acceptor.type};
+
+      if (acceptor.type == "PART") {
+        for(var j = p + 1; j <= i; j++) {
+          path[j].merge = p;
+        }
+        info.merge = p;
+      }
+
+      return info;
+    });
+    return infos.filter(function(info) { return info; });
+  },
+  
+  fallback: function(path, leftBoundary, text, i) {
+    var _info = path[leftBoundary];
+    if (text[i].match(/[\u0E48-\u0E4E]/)) {
+      if (leftBoundary != 0) 
+        leftBoundary = path[leftBoundary].p;
+      return {p: leftBoundary,
+              mw: 0,
+              w: 1 + _info.w,
+              unk: 1 + _info.unk,
+              type: "UNK"};      
+/*    } else if(leftBoundary > 0 && path[leftBoundary].type !== "UNK") {
+      leftBoundary = path[leftBoundary].p;
+      return {p: leftBoundary,
+              w: 1 + _info.w,
+              unk: 1 + _info.unk,
+              type: "UNK"};            */
+    } else {      
+      return {p: leftBoundary,
+              mw: _info.mw,
+              w: 1 + _info.w,
+              unk: 1 + _info.unk,
+              type: "UNK"};
+    }
+  },
+  
+  build: function(path, finalAcceptors, i, leftBoundary, text) {
+    var basicPathInfos = this.buildByAcceptors(path, finalAcceptors, i);
+    if (basicPathInfos.length > 0) {
+      return basicPathInfos;
+    } else {
+      return [this.fallback(path, leftBoundary, text, i)];
+    }
+  }
+};
+
+module.exports = function() {
+  return _.clone(PathInfoBuilder);
+}
+
+},{"./wordcut_core":8,"underscore":25}],5:[function(require,module,exports){
+var _ = require("underscore");
+
+
+var PathSelector = {
+  selectPath: function(paths) {
+    var path = paths.reduce(function(selectedPath, path) {
+      if (selectedPath == null) {        
+        return path;
+      } else {
+        if (path.unk < selectedPath.unk) 
+          return path;        
+        if (path.unk == selectedPath.unk) {
+          if (path.mw < selectedPath.mw)
+            return path
+          if (path.mw == selectedPath.mw) {
+            if (path.w < selectedPath.w) 
+              return path;
+          }
+        }
+        return selectedPath;
+      }
+    }, null);
+    return path;
+  },
+  
+  createPath: function() {
+    return [{p:null, w:0, unk:0, type: "INIT", mw:0}];
+  }
+};
+
+module.exports = function() {
+  return _.clone(PathSelector);
+};
+
+},{"underscore":25}],6:[function(require,module,exports){
+function isMatch(pat, offset, ch) {
+  if (pat.length <= offset)
+    return false;
+  var _ch = pat[offset];
+  return _ch == ch ||
+         (_ch.match(/[กข]/) && ch.match(/[ก-ฮ]/)) ||
+         (_ch.match(/[มบ]/) && ch.match(/[ก-ฮ]/)) ||
+         (_ch.match(/\u0E49/) && ch.match(/[\u0E48-\u0E4B]/));
+}
+
+var Rule0 = {
+  pat: "เหก็ม",
+  createAcceptor: function(tag) {
+    return {strOffset: 0,
+            isFinal: false,
+            transit: function(ch) {
+              if (isMatch(Rule0.pat, this.strOffset,ch)) {                 
+                this.isFinal = (this.strOffset + 1 == Rule0.pat.length);                
+                this.strOffset++;
+              } else {              
+                this.isError = true;             
+              }
+              return this;
+            },
+            isError: false,
+            tag: "THAI_RULE",
+            type: "THAI_RULE", 
+            w: 1};                        
+  }
+};
+
+var PartRule = {
+  createAcceptor: function(tag) {
+    return {strOffset: 0,
+            patterns: [
+              "แก", "เก", "ก้", "กก์", "กา", "กี", "กิ", "กืก"  
+            ],
+            isFinal: false,
+            transit: function(ch) {
+              var offset = this.strOffset;
+              this.patterns = this.patterns.filter(function(pat) {
+                return isMatch(pat, offset, ch);
+              });
+              
+              if (this.patterns.length > 0) {
+                var len = 1 + offset;
+                this.isFinal = this.patterns.some(function(pat) {
+                  return pat.length == len; 
+                });
+                this.strOffset++;
+              } else {              
+                this.isError = true;             
+              }
+              return this;
+            },
+            isError: false,
+            tag: "PART",
+            type: "PART", 
+            unk: 1,
+            w: 1};                        
+  }
+};
+
+var ThaiRules = [Rule0, PartRule];
+
+module.exports = ThaiRules;
+
+},{}],7:[function(require,module,exports){
+var sys = require("sys")
+  , WordcutDict = require("./dict")
+  , WordcutCore = require("./wordcut_core")
+  , PathInfoBuilder = require("./path_info_builder")
+  , PathSelector = require("./path_selector")
+  , Acceptors = require("./acceptors")
+  , latinRules = require("./latin_rules")
+  , thaiRules = require("./thai_rules")
+  , _ = require("underscore");
+
+
+// Wordcut inherits from WordcutCore via the prototype chain and exposes the
+// default building blocks as overridable properties; initNoDict()/init()
+// read them through `this`.
+var Wordcut = Object.create(WordcutCore);
+Wordcut.defaultPathInfoBuilder = PathInfoBuilder;
+Wordcut.defaultPathSelector = PathSelector;
+Wordcut.defaultAcceptors = Acceptors;
+Wordcut.defaultLatinRules = latinRules;
+Wordcut.defaultThaiRules = thaiRules;
+Wordcut.defaultDict = WordcutDict;
+
+
+Wordcut.initNoDict = function(dict_path) {
+  var self = this;
+  self.pathInfoBuilder = new self.defaultPathInfoBuilder;
+  self.pathSelector = new self.defaultPathSelector;
+  self.acceptors = new self.defaultAcceptors;
+  self.defaultLatinRules.forEach(function(rule) {
+    self.acceptors.creators.push(rule);
+  });
+  self.defaultThaiRules.forEach(function(rule) {
+    self.acceptors.creators.push(rule);
+  });
+};
+
+Wordcut.init = function(dict_path, withDefault, additionalWords) {
+  withDefault = withDefault || false;
+  this.initNoDict();
+  var dict = _.clone(this.defaultDict);
+  dict.init(dict_path, withDefault, additionalWords);
+  this.acceptors.creators.push(dict);
+};
+
+module.exports = Wordcut;
+
+},{"./acceptors":1,"./dict":2,"./latin_rules":3,"./path_info_builder":4,"./path_selector":5,"./thai_rules":6,"./wordcut_core":8,"sys":28,"underscore":25}],8:[function(require,module,exports){
+var WordcutCore = {
+
+  buildPath: function(text) {
+    var self = this
+      , path = self.pathSelector.createPath()
+      , leftBoundary = 0;
+    self.acceptors.reset();
+    for (var i = 0; i < text.length; i++) {
+      var ch = text[i];
+      self.acceptors.transit(ch);
+
+      var possiblePathInfos = self
+        .pathInfoBuilder
+        .build(path,
+               self.acceptors.getFinalAcceptors(),
+               i,
+               leftBoundary,
+               text);
+      var selectedPath = self.pathSelector.selectPath(possiblePathInfos)
+
+      path.push(selectedPath);
+      if (selectedPath.type !== "UNK") {
+        leftBoundary = i;
+      }
+    }
+    return path;
+  },
+
+  pathToRanges: function(path) {
+    var e = path.length - 1
+     , ranges = [];
+
+    while (e > 0) {
+      var info = path[e]
+       , s = info.p;
+
+      if (info.merge !== undefined && ranges.length > 0) {
+        var r = ranges[ranges.length - 1];
+        r.s = info.merge;
+        s = r.s;
+      } else {
+        ranges.push({s:s, e:e});
+      }
+      e = s;
+    }
+    return ranges.reverse();
+  },
+
+  rangesToText: function(text, ranges, delimiter) {
+    return ranges.map(function(r) {
+      return text.substring(r.s, r.e);
+    }).join(delimiter);
+  },
+
+  cut: function(text, delimiter) {
+    var path = this.buildPath(text)
+      , ranges = this.pathToRanges(path);
+    return this
+      .rangesToText(text, ranges,
+                    (delimiter === undefined ? "|" : delimiter));
+  },
+
+  cutIntoRanges: function(text, noText) {
+    var path = this.buildPath(text)
+      , ranges = this.pathToRanges(path);
+
+    if (!noText) {
+      ranges.forEach(function(r) {
+        r.text = text.substring(r.s, r.e);
+      });
+    }
+    return ranges;
+  },
+
+  cutIntoArray: function(text) {
+    var path = this.buildPath(text)
+      , ranges = this.pathToRanges(path);
+    
+    return ranges.map(function(r) {
+      return text.substring(r.s, r.e)
+    });
+  }
+};
+
+module.exports = WordcutCore;
+
+},{}],9:[function(require,module,exports){
+// http://wiki.commonjs.org/wiki/Unit_Testing/1.0
+//
+// THIS IS NOT TESTED NOR LIKELY TO WORK OUTSIDE V8!
+//
+// Originally from narwhal.js (http://narwhaljs.org)
+// Copyright (c) 2009 Thomas Robinson <280north.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the 'Software'), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// when used in node, this will actually load the util module we depend on
+// versus loading the builtin util module as happens otherwise
+// this is a bug in node module loading as far as I am concerned
+var util = require('util/');
+
+var pSlice = Array.prototype.slice;
+var hasOwn = Object.prototype.hasOwnProperty;
+
+// 1. The assert module provides functions that throw
+// AssertionError's when particular conditions are not met. The
+// assert module must conform to the following interface.
+
+var assert = module.exports = ok;
+
+// 2. The AssertionError is defined in assert.
+// new assert.AssertionError({ message: message,
+//                             actual: actual,
+//                             expected: expected })
+
+/**
+ * Error type thrown by the assertion helpers.
+ *
+ * Copies `actual`, `expected` and `operator` from options. When
+ * options.message is truthy it is used verbatim (generatedMessage = false);
+ * otherwise a message is generated with getMessage (generatedMessage = true).
+ * The stack trace starts at options.stackStartFunction, defaulting to `fail`.
+ */
+assert.AssertionError = function AssertionError(options) {
+  this.name = 'AssertionError';
+  this.actual = options.actual;
+  this.expected = options.expected;
+  this.operator = options.operator;
+  if (options.message) {
+    this.message = options.message;
+    this.generatedMessage = false;
+  } else {
+    this.message = getMessage(this);
+    this.generatedMessage = true;
+  }
+  var stackStartFunction = options.stackStartFunction || fail;
+
+  if (Error.captureStackTrace) {
+    Error.captureStackTrace(this, stackStartFunction);
+  }
+  else {
+    // non v8 browsers so we can have a stacktrace
+    var err = new Error();
+    if (err.stack) {
+      var out = err.stack;
+
+      // try to strip useless frames
+      var fn_name = stackStartFunction.name;
+      var idx = out.indexOf('\n' + fn_name);
+      if (idx >= 0) {
+        // once we have located the function frame
+        // we need to strip out everything before it (and its line)
+        var next_line = out.indexOf('\n', idx + 1);
+        out = out.substring(next_line + 1);
+      }
+
+      this.stack = out;
+    }
+  }
+};
+
+// assert.AssertionError instanceof Error
+util.inherits(assert.AssertionError, Error);
+
+function replacer(key, value) {
+  if (util.isUndefined(value)) {
+    return '' + value;
+  }
+  if (util.isNumber(value) && !isFinite(value)) {
+    return value.toString();
+  }
+  if (util.isFunction(value) || util.isRegExp(value)) {
+    return value.toString();
+  }
+  return value;
+}
+
+function truncate(s, n) {
+  if (util.isString(s)) {
+    return s.length < n ? s : s.slice(0, n);
+  } else {
+    return s;
+  }
+}
+
+function getMessage(self) {
+  return truncate(JSON.stringify(self.actual, replacer), 128) + ' ' +
+         self.operator + ' ' +
+         truncate(JSON.stringify(self.expected, replacer), 128);
+}
+
+// At present only the three keys mentioned above are used and
+// understood by the spec. Implementations or sub modules can pass
+// other keys to the AssertionError's constructor - they will be
+// ignored.
+
+// 3. All of the following functions must throw an AssertionError
+// when a corresponding condition is not met, with a message that
+// may be undefined if not provided.  All assertion methods provide
+// both the actual and expected values to the assertion error for
+// display purposes.
+
+function fail(actual, expected, message, operator, stackStartFunction) {
+  throw new assert.AssertionError({
+    message: message,
+    actual: actual,
+    expected: expected,
+    operator: operator,
+    stackStartFunction: stackStartFunction
+  });
+}
+
+// EXTENSION! allows for well behaved errors defined elsewhere.
+assert.fail = fail;
+
+// 4. Pure assertion tests whether a value is truthy, as determined
+// by !!guard.
+// assert.ok(guard, message_opt);
+// This statement is equivalent to assert.equal(true, !!guard,
+// message_opt);. To test strictly for the value true, use
+// assert.strictEqual(true, guard, message_opt);.
+
+function ok(value, message) {
+  if (!value) fail(value, true, message, '==', assert.ok);
+}
+assert.ok = ok;
+
+// 5. The equality assertion tests shallow, coercive equality with
+// ==.
+// assert.equal(actual, expected, message_opt);
+
+assert.equal = function equal(actual, expected, message) {
+  if (actual != expected) fail(actual, expected, message, '==', assert.equal);
+};
+
+// 6. The non-equality assertion tests for whether two objects are not equal
+// with != assert.notEqual(actual, expected, message_opt);
+
+assert.notEqual = function notEqual(actual, expected, message) {
+  if (actual == expected) {
+    fail(actual, expected, message, '!=', assert.notEqual);
+  }
+};
+
+// 7. The equivalence assertion tests a deep equality relation.
+// assert.deepEqual(actual, expected, message_opt);
+
+assert.deepEqual = function deepEqual(actual, expected, message) {
+  if (!_deepEqual(actual, expected)) {
+    fail(actual, expected, message, 'deepEqual', assert.deepEqual);
+  }
+};
+
+// Deep-equivalence test behind assert.deepEqual / assert.notDeepEqual.
+// Checks run in order: identity (===), buffers (element-wise), dates
+// (getTime), regular expressions (source and flags), non-object pairs (==),
+// and finally objEquiv for everything else. Returns a boolean.
+function _deepEqual(actual, expected) {
+  // 7.1. All identical values are equivalent, as determined by ===.
+  if (actual === expected) {
+    return true;
+
+  } else if (util.isBuffer(actual) && util.isBuffer(expected)) {
+    if (actual.length != expected.length) return false;
+
+    for (var i = 0; i < actual.length; i++) {
+      if (actual[i] !== expected[i]) return false;
+    }
+
+    return true;
+
+  // 7.2. If the expected value is a Date object, the actual value is
+  // equivalent if it is also a Date object that refers to the same time.
+  } else if (util.isDate(actual) && util.isDate(expected)) {
+    return actual.getTime() === expected.getTime();
+
+  // 7.3 If the expected value is a RegExp object, the actual value is
+  // equivalent if it is also a RegExp object with the same source and
+  // properties (`global`, `multiline`, `lastIndex`, `ignoreCase`).
+  } else if (util.isRegExp(actual) && util.isRegExp(expected)) {
+    return actual.source === expected.source &&
+           actual.global === expected.global &&
+           actual.multiline === expected.multiline &&
+           actual.lastIndex === expected.lastIndex &&
+           actual.ignoreCase === expected.ignoreCase;
+
+  // 7.4. Other pairs that do not both pass typeof value == 'object',
+  // equivalence is determined by ==.
+  } else if (!util.isObject(actual) && !util.isObject(expected)) {
+    return actual == expected;
+
+  // 7.5 For all other Object pairs, including Array objects, equivalence is
+  // determined by having the same number of owned properties (as verified
+  // with Object.prototype.hasOwnProperty.call), the same set of keys
+  // (although not necessarily the same order), equivalent values for every
+  // corresponding key, and an identical 'prototype' property. Note: this
+  // accounts for both named and indexed properties on Arrays.
+  } else {
+    return objEquiv(actual, expected);
+  }
+}
+
+function isArguments(object) {
+  return Object.prototype.toString.call(object) == '[object Arguments]';
+}
+
+// Object-pair equivalence used by _deepEqual. Returns false if either side
+// is null/undefined, if the `prototype` properties differ, or if only one
+// side is an arguments object. Two arguments objects are compared as arrays.
+// Otherwise both must have the same own keys and deep-equal values per key.
+function objEquiv(a, b) {
+  if (util.isNullOrUndefined(a) || util.isNullOrUndefined(b))
+    return false;
+  // an identical 'prototype' property.
+  if (a.prototype !== b.prototype) return false;
+  // if one is a primitive, the other must be same
+  if (util.isPrimitive(a) || util.isPrimitive(b)) {
+    return a === b;
+  }
+  var aIsArgs = isArguments(a),
+      bIsArgs = isArguments(b);
+  if ((aIsArgs && !bIsArgs) || (!aIsArgs && bIsArgs))
+    return false;
+  if (aIsArgs) {
+    a = pSlice.call(a);
+    b = pSlice.call(b);
+    return _deepEqual(a, b);
+  }
+  var ka = objectKeys(a),
+      kb = objectKeys(b),
+      key, i;
+  // having the same number of owned properties (keys incorporates
+  // hasOwnProperty)
+  if (ka.length != kb.length)
+    return false;
+  //the same set of keys (although not necessarily the same order),
+  ka.sort();
+  kb.sort();
+  //~~~cheap key test
+  for (i = ka.length - 1; i >= 0; i--) {
+    if (ka[i] != kb[i])
+      return false;
+  }
+  //equivalent values for every corresponding key, and
+  //~~~possibly expensive deep test
+  for (i = ka.length - 1; i >= 0; i--) {
+    key = ka[i];
+    if (!_deepEqual(a[key], b[key])) return false;
+  }
+  return true;
+}
+
+// 8. The non-equivalence assertion tests for any deep inequality.
+// assert.notDeepEqual(actual, expected, message_opt);
+
+assert.notDeepEqual = function notDeepEqual(actual, expected, message) {
+  if (_deepEqual(actual, expected)) {
+    fail(actual, expected, message, 'notDeepEqual', assert.notDeepEqual);
+  }
+};
+
+// 9. The strict equality assertion tests strict equality, as determined by ===.
+// assert.strictEqual(actual, expected, message_opt);
+
+assert.strictEqual = function strictEqual(actual, expected, message) {
+  if (actual !== expected) {
+    fail(actual, expected, message, '===', assert.strictEqual);
+  }
+};
+
+// 10. The strict non-equality assertion tests for strict inequality, as
+// determined by !==.  assert.notStrictEqual(actual, expected, message_opt);
+
+assert.notStrictEqual = function notStrictEqual(actual, expected, message) {
+  if (actual === expected) {
+    fail(actual, expected, message, '!==', assert.notStrictEqual);
+  }
+};
+
+function expectedException(actual, expected) {
+  if (!actual || !expected) {
+    return false;
+  }
+
+  if (Object.prototype.toString.call(expected) == '[object RegExp]') {
+    return expected.test(actual);
+  } else if (actual instanceof expected) {
+    return true;
+  } else if (expected.call({}, actual) === true) {
+    return true;
+  }
+
+  return false;
+}
+
+// Shared implementation of assert.throws / assert.doesNotThrow.
+// Runs block and captures whatever it throws. A string passed as `expected`
+// is treated as the message. Fails when an exception was required but none
+// was thrown, or when an unwanted exception matching `expected` was thrown;
+// any other captured exception that does not satisfy the expectation is
+// re-thrown unchanged.
+function _throws(shouldThrow, block, expected, message) {
+  var actual;
+
+  if (util.isString(expected)) {
+    message = expected;
+    expected = null;
+  }
+
+  try {
+    block();
+  } catch (e) {
+    actual = e;
+  }
+
+  message = (expected && expected.name ? ' (' + expected.name + ').' : '.') +
+            (message ? ' ' + message : '.');
+
+  if (shouldThrow && !actual) {
+    fail(actual, expected, 'Missing expected exception' + message);
+  }
+
+  if (!shouldThrow && expectedException(actual, expected)) {
+    fail(actual, expected, 'Got unwanted exception' + message);
+  }
+
+  if ((shouldThrow && actual && expected &&
+      !expectedException(actual, expected)) || (!shouldThrow && actual)) {
+    throw actual;
+  }
+}
+
+// 11. Expected to throw an error:
+// assert.throws(block, Error_opt, message_opt);
+
+assert.throws = function(block, /*optional*/error, /*optional*/message) {
+  // Forward all arguments to _throws with shouldThrow = true prepended.
+  var forwarded = pSlice.call(arguments);
+  forwarded.unshift(true);
+  _throws.apply(this, forwarded);
+};
+
+// EXTENSION! This is annoying to write outside this module.
+assert.doesNotThrow = function(block, /*optional*/message) {
+  _throws.apply(this, [false].concat(pSlice.call(arguments)));
+};
+
+assert.ifError = function(err) { if (err) {throw err;}};
+
+var objectKeys = Object.keys || function (obj) {
+  var keys = [];
+  for (var key in obj) {
+    if (hasOwn.call(obj, key)) keys.push(key);
+  }
+  return keys;
+};
+
+},{"util/":28}],10:[function(require,module,exports){
+'use strict';
+module.exports = balanced;
+function balanced(a, b, str) {
+  if (a instanceof RegExp) a = maybeMatch(a, str);
+  if (b instanceof RegExp) b = maybeMatch(b, str);
+
+  var r = range(a, b, str);
+
+  return r && {
+    start: r[0],
+    end: r[1],
+    pre: str.slice(0, r[0]),
+    body: str.slice(r[0] + a.length, r[1]),
+    post: str.slice(r[1] + b.length)
+  };
+}
+
+function maybeMatch(reg, str) {
+  var m = str.match(reg);
+  return m ? m[0] : null;
+}
+
+balanced.range = range;
+function range(a, b, str) {
+  var begs, beg, left, right, result;
+  var ai = str.indexOf(a);
+  var bi = str.indexOf(b, ai + 1);
+  var i = ai;
+
+  if (ai >= 0 && bi > 0) {
+    begs = [];
+    left = str.length;
+
+    while (i >= 0 && !result) {
+      if (i == ai) {
+        begs.push(i);
+        ai = str.indexOf(a, i + 1);
+      } else if (begs.length == 1) {
+        result = [ begs.pop(), bi ];
+      } else {
+        beg = begs.pop();
+        if (beg < left) {
+          left = beg;
+          right = bi;
+        }
+
+        bi = str.indexOf(b, i + 1);
+      }
+
+      i = ai < bi && ai >= 0 ? ai : bi;
+    }
+
+    if (begs.length) {
+      result = [ left, right ];
+    }
+  }
+
+  return result;
+}
+
+},{}],11:[function(require,module,exports){
+var concatMap = require('concat-map');
+var balanced = require('balanced-match');
+
+module.exports = expandTop;
+
+// Placeholder tokens that escapeBraces substitutes for backslash-escaped
+// characters and unescapeBraces turns back into the plain character. Each is
+// wrapped in NUL characters and carries a Math.random() value.
+var escSlash = '\0SLASH'+Math.random()+'\0';
+var escOpen = '\0OPEN'+Math.random()+'\0';
+var escClose = '\0CLOSE'+Math.random()+'\0';
+var escComma = '\0COMMA'+Math.random()+'\0';
+var escPeriod = '\0PERIOD'+Math.random()+'\0';
+
+function numeric(str) {
+  return parseInt(str, 10) == str
+    ? parseInt(str, 10)
+    : str.charCodeAt(0);
+}
+
+function escapeBraces(str) {
+  return str.split('\\\\').join(escSlash)
+            .split('\\{').join(escOpen)
+            .split('\\}').join(escClose)
+            .split('\\,').join(escComma)
+            .split('\\.').join(escPeriod);
+}
+
+function unescapeBraces(str) {
+  return str.split(escSlash).join('\\')
+            .split(escOpen).join('{')
+            .split(escClose).join('}')
+            .split(escComma).join(',')
+            .split(escPeriod).join('.');
+}
+
+
+// Basically just str.split(","), but handling cases
+// where we have nested braced sections, which should be
+// treated as individual members, like {a,{b,c},d}
+function parseCommaParts(str) {
+  if (!str)
+    return [''];
+
+  var parts = [];
+  var m = balanced('{', '}', str);
+
+  if (!m)
+    return str.split(',');
+
+  var pre = m.pre;
+  var body = m.body;
+  var post = m.post;
+  var p = pre.split(',');
+
+  p[p.length-1] += '{' + body + '}';
+  var postParts = parseCommaParts(post);
+  if (post.length) {
+    p[p.length-1] += postParts.shift();
+    p.push.apply(p, postParts);
+  }
+
+  parts.push.apply(parts, p);
+
+  return parts;
+}
+
+function expandTop(str) {
+  if (!str)
+    return [];
+
+  // I don't know why Bash 4.3 does this, but it does.
+  // Anything starting with {} will have the first two bytes preserved
+  // but *only* at the top level, so {},a}b will not expand to anything,
+  // but a{},b}c will be expanded to [a}c,abc].
+  // One could argue that this is a bug in Bash, but since the goal of
+  // this module is to match Bash's rules, we escape a leading {}
+  if (str.substr(0, 2) === '{}') {
+    str = '\\{\\}' + str.substr(2);
+  }
+
+  return expand(escapeBraces(str), true).map(unescapeBraces);
+}
+
+function identity(e) {
+  return e;
+}
+
+function embrace(str) {
+  return '{' + str + '}';
+}
+function isPadded(el) {
+  return /^-?0\d/.test(el);
+}
+
+function lte(i, y) {
+  return i <= y;
+}
+function gte(i, y) {
+  return i >= y;
+}
+
+function expand(str, isTop) {
+  var expansions = [];
+
+  var m = balanced('{', '}', str);
+  if (!m || /\$$/.test(m.pre)) return [str];
+
+  var isNumericSequence = /^-?\d+\.\.-?\d+(?:\.\.-?\d+)?$/.test(m.body);
+  var isAlphaSequence = /^[a-zA-Z]\.\.[a-zA-Z](?:\.\.-?\d+)?$/.test(m.body);
+  var isSequence = isNumericSequence || isAlphaSequence;
+  var isOptions = m.body.indexOf(',') >= 0;
+  if (!isSequence && !isOptions) {
+    // {a},b}
+    if (m.post.match(/,.*\}/)) {
+      str = m.pre + '{' + m.body + escClose + m.post;
+      return expand(str);
+    }
+    return [str];
+  }
+
+  var n;
+  if (isSequence) {
+    n = m.body.split(/\.\./);
+  } else {
+    n = parseCommaParts(m.body);
+    if (n.length === 1) {
+      // x{{a,b}}y ==> x{a}y x{b}y
+      n = expand(n[0], false).map(embrace);
+      if (n.length === 1) {
+        var post = m.post.length
+          ? expand(m.post, false)
+          : [''];
+        return post.map(function(p) {
+          return m.pre + n[0] + p;
+        });
+      }
+    }
+  }
+
+  // at this point, n is the parts, and we know it's not a comma set
+  // with a single entry.
+
+  // no need to expand pre, since it is guaranteed to be free of brace-sets
+  var pre = m.pre;
+  var post = m.post.length
+    ? expand(m.post, false)
+    : [''];
+
+  var N;
+
+  if (isSequence) {
+    var x = numeric(n[0]);
+    var y = numeric(n[1]);
+    var width = Math.max(n[0].length, n[1].length)
+    var incr = n.length == 3
+      ? Math.abs(numeric(n[2]))
+      : 1;
+    var test = lte;
+    var reverse = y < x;
+    if (reverse) {
+      incr *= -1;
+      test = gte;
+    }
+    var pad = n.some(isPadded);
+
+    N = [];
+
+    for (var i = x; test(i, y); i += incr) {
+      var c;
+      if (isAlphaSequence) {
+        c = String.fromCharCode(i);
+        if (c === '\\')
+          c = '';
+      } else {
+        c = String(i);
+        if (pad) {
+          var need = width - c.length;
+          if (need > 0) {
+            var z = new Array(need + 1).join('0');
+            if (i < 0)
+              c = '-' + z + c.slice(1);
+            else
+              c = z + c;
+          }
+        }
+      }
+      N.push(c);
+    }
+  } else {
+    N = concatMap(n, function(el) { return expand(el, false) });
+  }
+
+  for (var j = 0; j < N.length; j++) {
+    for (var k = 0; k < post.length; k++) {
+      var expansion = pre + N[j] + post[k];
+      if (!isTop || isSequence || expansion)
+        expansions.push(expansion);
+    }
+  }
+
+  return expansions;
+}
+
+
+},{"balanced-match":10,"concat-map":13}],12:[function(require,module,exports){
+
+},{}],13:[function(require,module,exports){
+module.exports = function (xs, fn) {
+    var res = [];
+    for (var i = 0; i < xs.length; i++) {
+        var x = fn(xs[i], i);
+        if (isArray(x)) res.push.apply(res, x);
+        else res.push(x);
+    }
+    return res;
+};
+
+var isArray = Array.isArray || function (xs) {
+    return Object.prototype.toString.call(xs) === '[object Array]';
+};
+
+},{}],14:[function(require,module,exports){
+// Copyright Joyent, Inc. and other Node contributors.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to permit
+// persons to whom the Software is furnished to do so, subject to the
+// following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
+// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+// USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+function EventEmitter() {
+  this._events = this._events || {};
+  this._maxListeners = this._maxListeners || undefined;
+}
+module.exports = EventEmitter;
+
+// Backwards-compat with node 0.10.x
+EventEmitter.EventEmitter = EventEmitter;
+
+EventEmitter.prototype._events = undefined;
+EventEmitter.prototype._maxListeners = undefined;
+
+// By default EventEmitters will print a warning if more than 10 listeners are
+// added to it. This is a useful default which helps finding memory leaks.
+EventEmitter.defaultMaxListeners = 10;
+
+// Obviously not all Emitters should be limited to 10. This function allows
+// that to be increased. Set to zero for unlimited.
+EventEmitter.prototype.setMaxListeners = function(n) {
+  if (!isNumber(n) || n < 0 || isNaN(n))
+    throw TypeError('n must be a positive number');
+  this._maxListeners = n;
+  return this;
+};
+
+// Synchronously calls the handler(s) stored under `type`, passing along every
+// argument given after `type`.
+// Returns false when nothing is stored for `type`, true otherwise.
+// Emitting 'error' with no handler throws: the second argument itself when it
+// is an Error, otherwise a generic TypeError.
+EventEmitter.prototype.emit = function(type) {
+  var er, handler, len, args, i, listeners;
+
+  if (!this._events)
+    this._events = {};
+
+  // If there is no 'error' event listener then throw.
+  if (type === 'error') {
+    if (!this._events.error ||
+        (isObject(this._events.error) && !this._events.error.length)) {
+      er = arguments[1];
+      if (er instanceof Error) {
+        throw er; // Unhandled 'error' event
+      }
+      throw TypeError('Uncaught, unspecified "error" event.');
+    }
+  }
+
+  handler = this._events[type];
+
+  if (isUndefined(handler))
+    return false;
+
+  // A single listener is stored as the bare function.
+  if (isFunction(handler)) {
+    switch (arguments.length) {
+      // fast cases
+      case 1:
+        handler.call(this);
+        break;
+      case 2:
+        handler.call(this, arguments[1]);
+        break;
+      case 3:
+        handler.call(this, arguments[1], arguments[2]);
+        break;
+      // slower
+      default:
+        len = arguments.length;
+        args = new Array(len - 1);
+        for (i = 1; i < len; i++)
+          args[i - 1] = arguments[i];
+        handler.apply(this, args);
+    }
+  } else if (isObject(handler)) {
+    // Several listeners are stored as an array.
+    len = arguments.length;
+    args = new Array(len - 1);
+    for (i = 1; i < len; i++)
+      args[i - 1] = arguments[i];
+
+    // Iterate over a copy so changes made to the stored list while
+    // dispatching do not affect this pass.
+    listeners = handler.slice();
+    len = listeners.length;
+    for (i = 0; i < len; i++)
+      listeners[i].apply(this, args);
+  }
+
+  return true;
+};
+
+// Registers `listener` for `type` and returns the emitter for chaining.
+// Throws a TypeError when `listener` is not a function.
+// When a 'newListener' handler exists it is emitted before the listener is
+// stored. Once the list for `type` grows past the limit (own _maxListeners,
+// else EventEmitter.defaultMaxListeners) a warning is printed, one time only.
+EventEmitter.prototype.addListener = function(type, listener) {
+  var m;
+
+  if (!isFunction(listener))
+    throw TypeError('listener must be a function');
+
+  if (!this._events)
+    this._events = {};
+
+  // To avoid recursion in the case that type === "newListener"! Before
+  // adding it to the listeners, first emit "newListener".
+  // A wrapper that carries a `.listener` function (as created by once())
+  // is reported as that wrapped function.
+  if (this._events.newListener)
+    this.emit('newListener', type,
+              isFunction(listener.listener) ?
+              listener.listener : listener);
+
+  if (!this._events[type])
+    // Optimize the case of one listener. Don't need the extra array object.
+    this._events[type] = listener;
+  else if (isObject(this._events[type]))
+    // If we've already got an array, just append.
+    this._events[type].push(listener);
+  else
+    // Adding the second element, need to change to array.
+    this._events[type] = [this._events[type], listener];
+
+  // Check for listener leak
+  if (isObject(this._events[type]) && !this._events[type].warned) {
+    var m;
+    if (!isUndefined(this._maxListeners)) {
+      m = this._maxListeners;
+    } else {
+      m = EventEmitter.defaultMaxListeners;
+    }
+
+    // A limit of 0 (or any falsy value) disables the warning.
+    if (m && m > 0 && this._events[type].length > m) {
+      // `warned` is stored on the array itself so the message prints once.
+      this._events[type].warned = true;
+      console.error('(node) warning: possible EventEmitter memory ' +
+                    'leak detected. %d listeners added. ' +
+                    'Use emitter.setMaxListeners() to increase limit.',
+                    this._events[type].length);
+      if (typeof console.trace === 'function') {
+        // not supported in IE 10
+        console.trace();
+      }
+    }
+  }
+
+  return this;
+};
+
+EventEmitter.prototype.on = EventEmitter.prototype.addListener;
+
+// Registers `listener` so that it runs for the next `type` event only.
+// Throws a TypeError when `listener` is not a function. Returns the emitter.
+EventEmitter.prototype.once = function(type, listener) {
+  if (!isFunction(listener))
+    throw TypeError('listener must be a function');
+
+  // Guards against the wrapper being invoked a second time.
+  var fired = false;
+
+  // Wrapper that unregisters itself before forwarding to `listener`.
+  function g() {
+    this.removeListener(type, g);
+
+    if (!fired) {
+      fired = true;
+      listener.apply(this, arguments);
+    }
+  }
+
+  // Lets removeListener match the wrapper when given the original function.
+  g.listener = listener;
+  this.on(type, g);
+
+  return this;
+};
+
+// emits a 'removeListener' event iff the listener was removed
+// Removes one registration of `listener` for `type` and returns the emitter.
+// Throws a TypeError when `listener` is not a function. A once() wrapper is
+// matched through its `.listener` property. When a 'removeListener' handler
+// exists it is emitted after a successful removal.
+EventEmitter.prototype.removeListener = function(type, listener) {
+  var list, position, length, i;
+
+  if (!isFunction(listener))
+    throw TypeError('listener must be a function');
+
+  if (!this._events || !this._events[type])
+    return this;
+
+  list = this._events[type];
+  length = list.length;
+  position = -1;
+
+  // Single listener stored as the bare function (or a once() wrapper).
+  if (list === listener ||
+      (isFunction(list.listener) && list.listener === listener)) {
+    delete this._events[type];
+    if (this._events.removeListener)
+      this.emit('removeListener', type, listener);
+
+  } else if (isObject(list)) {
+    // Array of listeners: search from the end, so the most recently added
+    // matching registration is the one removed.
+    for (i = length; i-- > 0;) {
+      if (list[i] === listener ||
+          (list[i].listener && list[i].listener === listener)) {
+        position = i;
+        break;
+      }
+    }
+
+    if (position < 0)
+      return this;
+
+    if (list.length === 1) {
+      list.length = 0;
+      delete this._events[type];
+    } else {
+      list.splice(position, 1);
+    }
+
+    if (this._events.removeListener)
+      this.emit('removeListener', type, listener);
+  }
+
+  return this;
+};
+
+// Removes every listener for `type`, or for all events when called without
+// arguments. Returns the emitter for chaining.
+EventEmitter.prototype.removeAllListeners = function(type) {
+  var key, listeners;
+
+  if (!this._events)
+    return this;
+
+  // not listening for removeListener, no need to emit
+  if (!this._events.removeListener) {
+    if (arguments.length === 0)
+      this._events = {};
+    else if (this._events[type])
+      delete this._events[type];
+    return this;
+  }
+
+  // emit removeListener for all listeners on all events
+  if (arguments.length === 0) {
+    for (key in this._events) {
+      // handled last, so the handlers stay registered while the other
+      // events are being cleared
+      if (key === 'removeListener') continue;
+      this.removeAllListeners(key);
+    }
+    this.removeAllListeners('removeListener');
+    this._events = {};
+    return this;
+  }
+
+  listeners = this._events[type];
+
+  // Go through removeListener so each removal emits 'removeListener'.
+  if (isFunction(listeners)) {
+    this.removeListener(type, listeners);
+  } else {
+    // LIFO order
+    while (listeners.length)
+      this.removeListener(type, listeners[listeners.length - 1]);
+  }
+  delete this._events[type];
+
+  return this;
+};
+
+EventEmitter.prototype.listeners = function(type) {
+  var ret;
+  if (!this._events || !this._events[type])
+    ret = [];
+  else if (isFunction(this._events[type]))
+    ret = [this._events[type]];
+  else
+    ret = this._events[type].slice();
+  return ret;
+};
+
+EventEmitter.listenerCount = function(emitter, type) {
+  var ret;
+  if (!emitter._events || !emitter._events[type])
+    ret = 0;
+  else if (isFunction(emitter._events[type]))
+    ret = 1;
+  else
+    ret = emitter._events[type].length;
+  return ret;
+};
+
+function isFunction(arg) {
+  return typeof arg === 'function';
+}
+
+function isNumber(arg) {
+  return typeof arg === 'number';
+}
+
+function isObject(arg) {
+  return typeof arg === 'object' && arg !== null;
+}
+
+function isUndefined(arg) {
+  return arg === void 0;
+}
+
+},{}],15:[function(require,module,exports){
+(function (process){
+exports.alphasort = alphasort
+exports.alphasorti = alphasorti
+exports.setopts = setopts
+exports.ownProp = ownProp
+exports.makeAbs = makeAbs
+exports.finish = finish
+exports.mark = mark
+exports.isIgnored = isIgnored
+exports.childrenIgnored = childrenIgnored
+
+function ownProp (obj, field) {
+  return Object.prototype.hasOwnProperty.call(obj, field)
+}
+
+var path = require("path")
+var minimatch = require("minimatch")
+var isAbsolute = require("path-is-absolute")
+var Minimatch = minimatch.Minimatch
+
+function alphasorti (a, b) {
+  return a.toLowerCase().localeCompare(b.toLowerCase())
+}
+
+function alphasort (a, b) {
+  return a.localeCompare(b)
+}
+
+function setupIgnores (self, options) {
+  self.ignore = options.ignore || []
+
+  if (!Array.isArray(self.ignore))
+    self.ignore = [self.ignore]
+
+  if (self.ignore.length) {
+    self.ignore = self.ignore.map(ignoreMap)
+  }
+}
+
+function ignoreMap (pattern) {
+  var gmatcher = null
+  if (pattern.slice(-3) === '/**') {
+    var gpattern = pattern.replace(/(\/\*\*)+$/, '')
+    gmatcher = new Minimatch(gpattern)
+  }
+
+  return {
+    matcher: new Minimatch(pattern),
+    gmatcher: gmatcher
+  }
+}
+
+// Copies the glob options onto `self` (a Glob-like object) and builds its
+// Minimatch instance.
+//   self    - object that receives the normalised settings
+//   pattern - glob pattern string
+//   options - optional options object; note that its `nonegate` and
+//             `nocomment` properties are overwritten below
+// Throws when matchBase is combined with noglobstar.
+function setopts (self, pattern, options) {
+  if (!options)
+    options = {}
+
+  // base-matching: just use globstar for that.
+  if (options.matchBase && -1 === pattern.indexOf("/")) {
+    if (options.noglobstar) {
+      throw new Error("base matching requires globstar")
+    }
+    pattern = "**/" + pattern
+  }
+
+  self.silent = !!options.silent
+  self.pattern = pattern
+  // strict defaults to true: only an explicit `false` turns it off
+  self.strict = options.strict !== false
+  self.realpath = !!options.realpath
+  self.realpathCache = options.realpathCache || Object.create(null)
+  self.follow = !!options.follow
+  self.dot = !!options.dot
+  self.mark = !!options.mark
+  self.nodir = !!options.nodir
+  // nodir forces mark on; finish() filters on the trailing slash mark adds
+  if (self.nodir)
+    self.mark = true
+  self.sync = !!options.sync
+  self.nounique = !!options.nounique
+  self.nonull = !!options.nonull
+  self.nosort = !!options.nosort
+  self.nocase = !!options.nocase
+  self.stat = !!options.stat
+  self.noprocess = !!options.noprocess
+
+  self.maxLength = options.maxLength || Infinity
+  // caches may be supplied by the caller, otherwise fresh prototype-less maps
+  self.cache = options.cache || Object.create(null)
+  self.statCache = options.statCache || Object.create(null)
+  self.symlinks = options.symlinks || Object.create(null)
+
+  setupIgnores(self, options)
+
+  // changedCwd records whether an explicit cwd differs from process.cwd()
+  self.changedCwd = false
+  var cwd = process.cwd()
+  if (!ownProp(options, "cwd"))
+    self.cwd = cwd
+  else {
+    self.cwd = options.cwd
+    self.changedCwd = path.resolve(options.cwd) !== cwd
+  }
+
+  self.root = options.root || path.resolve(self.cwd, "/")
+  self.root = path.resolve(self.root)
+  // on Windows the root is stored with forward slashes
+  if (process.platform === "win32")
+    self.root = self.root.replace(/\\/g, "/")
+
+  self.nomount = !!options.nomount
+
+  // disable comments and negation unless the user explicitly
+  // passes in false as the option.
+  options.nonegate = options.nonegate === false ? false : true
+  options.nocomment = options.nocomment === false ? false : true
+  deprecationWarning(options)
+
+  self.minimatch = new Minimatch(pattern, options)
+  self.options = self.minimatch.options
+}
+
+// TODO(isaacs): remove entirely in v6
+// exported to reset in tests
+exports.deprecationWarned
+function deprecationWarning(options) {
+  if (!options.nonegate || !options.nocomment) {
+    if (process.noDeprecation !== true && !exports.deprecationWarned) {
+      var msg = 'glob WARNING: comments and negation will be disabled in v6'
+      if (process.throwDeprecation)
+        throw new Error(msg)
+      else if (process.traceDeprecation)
+        console.trace(msg)
+      else
+        console.error(msg)
+
+      exports.deprecationWarned = true
+    }
+  }
+}
+
+// Collapses the per-pattern match sets in self.matches into the final result
+// list and stores it on self.found.
+// Applies, in order: nonull literals, de-duplication (unless nounique),
+// sorting (unless nosort), mark/nodir handling, and the ignore filter.
+function finish (self) {
+  var nou = self.nounique
+  // array keeps duplicates; a prototype-less object de-duplicates by key
+  var all = nou ? [] : Object.create(null)
+
+  for (var i = 0, l = self.matches.length; i < l; i ++) {
+    var matches = self.matches[i]
+    if (!matches || Object.keys(matches).length === 0) {
+      if (self.nonull) {
+        // do like the shell, and spit out the literal glob
+        var literal = self.minimatch.globSet[i]
+        if (nou)
+          all.push(literal)
+        else
+          all[literal] = true
+      }
+    } else {
+      // had matches
+      var m = Object.keys(matches)
+      if (nou)
+        all.push.apply(all, m)
+      else
+        m.forEach(function (m) {
+          all[m] = true
+        })
+    }
+  }
+
+  if (!nou)
+    all = Object.keys(all)
+
+  if (!self.nosort)
+    all = all.sort(self.nocase ? alphasorti : alphasort)
+
+  // at *some* point we statted all of these
+  if (self.mark) {
+    for (var i = 0; i < all.length; i++) {
+      all[i] = self._mark(all[i])
+    }
+    // nodir: drop every entry that ends in a slash after marking
+    if (self.nodir) {
+      all = all.filter(function (e) {
+        return !(/\/$/.test(e))
+      })
+    }
+  }
+
+  if (self.ignore.length)
+    all = all.filter(function(m) {
+      return !isIgnored(self, m)
+    })
+
+  self.found = all
+}
+
+function mark (self, p) {
+  var abs = makeAbs(self, p)
+  var c = self.cache[abs]
+  var m = p
+  if (c) {
+    var isDir = c === 'DIR' || Array.isArray(c)
+    var slash = p.slice(-1) === '/'
+
+    if (isDir && !slash)
+      m += '/'
+    else if (!isDir && slash)
+      m = m.slice(0, -1)
+
+    if (m !== p) {
+      var mabs = makeAbs(self, m)
+      self.statCache[mabs] = self.statCache[abs]
+      self.cache[mabs] = self.cache[abs]
+    }
+  }
+
+  return m
+}
+
+// lotta situps...
+function makeAbs (self, f) {
+  var abs = f
+  if (f.charAt(0) === '/') {
+    abs = path.join(self.root, f)
+  } else if (isAbsolute(f) || f === '') {
+    abs = f
+  } else if (self.changedCwd) {
+    abs = path.resolve(self.cwd, f)
+  } else {
+    abs = path.resolve(f)
+  }
+  return abs
+}
+
+
+// Returns true if `path` matches any ignore pattern. For a pattern that ends with
+// a globstar ('/**'), the accompanying parent directory is ignored as well.
+// Ex:- If node_modules/** is the pattern, 'node_modules' is ignored along with its contents
+function isIgnored (self, path) {
+  if (!self.ignore.length)
+    return false
+
+  return self.ignore.some(function(item) {
+    return item.matcher.match(path) || !!(item.gmatcher && item.gmatcher.match(path))
+  })
+}
+
+function childrenIgnored (self, path) {
+  if (!self.ignore.length)
+    return false
+
+  return self.ignore.some(function(item) {
+    return !!(item.gmatcher && item.gmatcher.match(path))
+  })
+}
+
+}).call(this,require('_process'))
+},{"_process":24,"minimatch":20,"path":22,"path-is-absolute":23}],16:[function(require,module,exports){
+(function (process){
+// Approach:
+//
+// 1. Get the minimatch set
+// 2. For each pattern in the set, PROCESS(pattern, false)
+// 3. Store matches per-set, then uniq them
+//
+// PROCESS(pattern, inGlobStar)
+// Get the first [n] items from pattern that are all strings
+// Join these together.  This is PREFIX.
+//   If there is no more remaining, then stat(PREFIX) and
+//   add to matches if it succeeds.  END.
+//
+// If inGlobStar and PREFIX is symlink and points to dir
+//   set ENTRIES = []
+// else readdir(PREFIX) as ENTRIES
+//   If fail, END
+//
+// with ENTRIES
+//   If pattern[n] is GLOBSTAR
+//     // handle the case where the globstar match is empty
+//     // by pruning it out, and testing the resulting pattern
+//     PROCESS(pattern[0..n] + pattern[n+1 .. $], false)
+//     // handle other cases.
+//     for ENTRY in ENTRIES (not dotfiles)
+//       // attach globstar + tail onto the entry
+//       // Mark that this entry is a globstar match
+//       PROCESS(pattern[0..n] + ENTRY + pattern[n .. $], true)
+//
+//   else // not globstar
+//     for ENTRY in ENTRIES (not dotfiles, unless pattern[n] is dot)
+//       Test ENTRY against pattern[n]
+//       If fails, continue
+//       If passes, PROCESS(pattern[0..n] + item + pattern[n+1 .. $])
+//
+// Caveat:
+//   Cache all stats and readdirs results to minimize syscall.  Since all
+//   we ever care about is existence and directory-ness, we can just keep
+//   `true` for files, and [children,...] for directories, or `false` for
+//   things that don't exist.
+
+module.exports = glob
+
+var fs = require('fs')
+var minimatch = require('minimatch')
+var Minimatch = minimatch.Minimatch
+var inherits = require('inherits')
+var EE = require('events').EventEmitter
+var path = require('path')
+var assert = require('assert')
+var isAbsolute = require('path-is-absolute')
+var globSync = require('./sync.js')
+var common = require('./common.js')
+var alphasort = common.alphasort
+var alphasorti = common.alphasorti
+var setopts = common.setopts
+var ownProp = common.ownProp
+var inflight = require('inflight')
+var util = require('util')
+var childrenIgnored = common.childrenIgnored
+var isIgnored = common.isIgnored
+
+var once = require('once')
+
+function glob (pattern, options, cb) {
+  if (typeof options === 'function') cb = options, options = {}
+  if (!options) options = {}
+
+  if (options.sync) {
+    if (cb)
+      throw new TypeError('callback provided to sync glob')
+    return globSync(pattern, options)
+  }
+
+  return new Glob(pattern, options, cb)
+}
+
+glob.sync = globSync
+var GlobSync = glob.GlobSync = globSync.GlobSync
+
+// old api surface
+glob.glob = glob
+
+glob.hasMagic = function (pattern, options_) {
+  var options = util._extend({}, options_)
+  options.noprocess = true
+
+  var g = new Glob(pattern, options)
+  var set = g.minimatch.set
+  if (set.length > 1)
+    return true
+
+  for (var j = 0; j < set[0].length; j++) {
+    if (typeof set[0][j] !== 'string')
+      return true
+  }
+
+  return false
+}
+
+glob.Glob = Glob
+inherits(Glob, EE)
+function Glob (pattern, options, cb) {
+  if (typeof options === 'function') {
+    cb = options
+    options = null
+  }
+
+  if (options && options.sync) {
+    if (cb)
+      throw new TypeError('callback provided to sync glob')
+    return new GlobSync(pattern, options)
+  }
+
+  if (!(this instanceof Glob))
+    return new Glob(pattern, options, cb)
+
+  setopts(this, pattern, options)
+  this._didRealPath = false
+
+  // process each pattern in the minimatch set
+  var n = this.minimatch.set.length
+
+  // The matches are stored as {<filename>: true,...} so that
+  // duplicates are automagically pruned.
+  // Later, we do an Object.keys() on these.
+  // Keep them as a list so we can fill in when nonull is set.
+  this.matches = new Array(n)
+
+  if (typeof cb === 'function') {
+    cb = once(cb)
+    this.on('error', cb)
+    this.on('end', function (matches) {
+      cb(null, matches)
+    })
+  }
+
+  var self = this
+  var n = this.minimatch.set.length
+  this._processing = 0
+  this.matches = new Array(n)
+
+  this._emitQueue = []
+  this._processQueue = []
+  this.paused = false
+
+  if (this.noprocess)
+    return this
+
+  if (n === 0)
+    return done()
+
+  for (var i = 0; i < n; i ++) {
+    this._process(this.minimatch.set[i], i, false, done)
+  }
+
+  function done () {
+    --self._processing
+    if (self._processing <= 0)
+      self._finish()
+  }
+}
+
+Glob.prototype._finish = function () {
+  assert(this instanceof Glob)
+  if (this.aborted)
+    return
+
+  if (this.realpath && !this._didRealpath)
+    return this._realpath()
+
+  common.finish(this)
+  this.emit('end', this.found)
+}
+
+// Replaces every match set with its real-path equivalents (one _realpathSet
+// call per set), then calls _finish again. Runs at most once per Glob.
+Glob.prototype._realpath = function () {
+  if (this._didRealpath)
+    return
+
+  this._didRealpath = true
+
+  // n counts the sets still being resolved
+  var n = this.matches.length
+  if (n === 0)
+    return this._finish()
+
+  var self = this
+  for (var i = 0; i < this.matches.length; i++)
+    this._realpathSet(i, next)
+
+  // called once per set; the last one to complete triggers _finish
+  function next () {
+    if (--n === 0)
+      self._finish()
+  }
+}
+
+// Resolves every path in this.matches[index] with fs.realpath and replaces
+// the set with the resolved paths.
+//   index - position of the match set in this.matches
+//   cb    - called with no arguments once the whole set has been handled
+//           (immediately when the set is missing or empty)
+Glob.prototype._realpathSet = function (index, cb) {
+  var matchset = this.matches[index]
+  if (!matchset)
+    return cb()
+
+  var found = Object.keys(matchset)
+  var self = this
+  // n counts the realpath calls still outstanding
+  var n = found.length
+
+  if (n === 0)
+    return cb()
+
+  // start over with an empty set that the callbacks below fill in
+  var set = this.matches[index] = Object.create(null)
+  found.forEach(function (p, i) {
+    // If there's a problem with the stat, then it means that
+    // one or more of the links in the realpath couldn't be
+    // resolved.  just return the abs value in that case.
+    p = self._makeAbs(p)
+    fs.realpath(p, self.realpathCache, function (er, real) {
+      if (!er)
+        set[real] = true
+      else if (er.syscall === 'stat')
+        set[p] = true
+      else
+        self.emit('error', er) // srsly wtf right here
+
+      if (--n === 0) {
+        self.matches[index] = set
+        cb()
+      }
+    })
+  })
+}
+
+Glob.prototype._mark = function (p) {
+  return common.mark(this, p)
+}
+
+Glob.prototype._makeAbs = function (f) {
+  return common.makeAbs(this, f)
+}
+
+Glob.prototype.abort = function () {
+  this.aborted = true
+  this.emit('abort')
+}
+
+Glob.prototype.pause = function () {
+  if (!this.paused) {
+    this.paused = true
+    this.emit('pause')
+  }
+}
+
+// Resumes a paused walk: emits 'resume', then replays the matches and the
+// _process calls that were queued while paused. A no-op when not paused.
+Glob.prototype.resume = function () {
+  if (this.paused) {
+    this.emit('resume')
+    this.paused = false
+    if (this._emitQueue.length) {
+      // copy and clear first, so entries queued during replay are kept apart
+      var eq = this._emitQueue.slice(0)
+      this._emitQueue.length = 0
+      for (var i = 0; i < eq.length; i ++) {
+        var e = eq[i]
+        this._emitMatch(e[0], e[1])
+      }
+    }
+    if (this._processQueue.length) {
+      var pq = this._processQueue.slice(0)
+      this._processQueue.length = 0
+      for (var i = 0; i < pq.length; i ++) {
+        var p = pq[i]
+        // _process already counted this call when it queued it and will
+        // count it again now, so undo the first increment
+        this._processing--
+        this._process(p[0], p[1], p[2], p[3])
+      }
+    }
+  }
+}
+
+// Processes one parsed pattern: splits off its leading run of literal string
+// parts and dispatches on what follows.
+//   pattern    - array of pattern parts (strings and non-string matchers)
+//   index      - position of this pattern's match set in this.matches
+//   inGlobStar - passed through to the readdir step
+//   cb         - completion callback (must be a function)
+// Does nothing once aborted; while paused the call is queued for resume().
+Glob.prototype._process = function (pattern, index, inGlobStar, cb) {
+  assert(this instanceof Glob)
+  assert(typeof cb === 'function')
+
+  if (this.aborted)
+    return
+
+  this._processing++
+  if (this.paused) {
+    this._processQueue.push([pattern, index, inGlobStar, cb])
+    return
+  }
+
+  //console.error('PROCESS %d', this._processing, pattern)
+
+  // Get the first [n] parts of pattern that are all strings.
+  var n = 0
+  while (typeof pattern[n] === 'string') {
+    n ++
+  }
+  // now n is the index of the first one that is *not* a string.
+
+  // see if there's anything else
+  var prefix
+  switch (n) {
+    // if not, then this is rather simple
+    case pattern.length:
+      this._processSimple(pattern.join('/'), index, cb)
+      return
+
+    case 0:
+      // pattern *starts* with some non-trivial item.
+      // going to readdir(cwd), but not include the prefix in matches.
+      prefix = null
+      break
+
+    default:
+      // pattern has some string bits in the front.
+      // whatever it starts with, whether that's 'absolute' like /foo/bar,
+      // or 'relative' like '../baz'
+      prefix = pattern.slice(0, n).join('/')
+      break
+  }
+
+  // the parts after the literal prefix, starting with the first matcher
+  var remain = pattern.slice(n)
+
+  // get the list of entries.
+  var read
+  if (prefix === null)
+    read = '.'
+  else if (isAbsolute(prefix) || isAbsolute(pattern.join('/'))) {
+    // the pattern is absolute but the joined prefix lost its leading slash
+    if (!prefix || !isAbsolute(prefix))
+      prefix = '/' + prefix
+    read = prefix
+  } else
+    read = prefix
+
+  var abs = this._makeAbs(read)
+
+  //if ignored, skip _processing
+  if (childrenIgnored(this, read))
+    return cb()
+
+  var isGlobStar = remain[0] === minimatch.GLOBSTAR
+  if (isGlobStar)
+    this._processGlobStar(prefix, read, abs, remain, index, inGlobStar, cb)
+  else
+    this._processReaddir(prefix, read, abs, remain, index, inGlobStar, cb)
+}
+
+Glob.prototype._processReaddir = function (prefix, read, abs, remain, index, inGlobStar, cb) {
+  var self = this
+  this._readdir(abs, inGlobStar, function (er, entries) {
+    return self._processReaddir2(prefix, read, abs, remain, index, inGlobStar, entries, cb)
+  })
+}
+
+// Matches directory `entries` against the first remaining pattern part.
+// When that part is the last one (and neither mark nor stat is set) the
+// matching entries are emitted as results; otherwise each matching entry
+// replaces that part and the rest of the pattern is processed recursively.
+//   prefix  - literal path in front of the entries (null/empty for none)
+//   remain  - remaining pattern parts; remain[0] is the matcher applied here
+//   index   - position of this pattern's match set in this.matches
+//   entries - directory listing, falsy when the directory could not be read
+//   cb      - completion callback
+Glob.prototype._processReaddir2 = function (prefix, read, abs, remain, index, inGlobStar, entries, cb) {
+
+  // if the abs isn't a dir, then nothing can match!
+  if (!entries)
+    return cb()
+
+  // It will only match dot entries if it starts with a dot, or if
+  // dot is set.  Stuff like @(.foo|.bar) isn't allowed.
+  var pn = remain[0]
+  var negate = !!this.minimatch.negate
+  var rawGlob = pn._glob
+  var dotOk = this.dot || rawGlob.charAt(0) === '.'
+
+  var matchedEntries = []
+  for (var i = 0; i < entries.length; i++) {
+    var e = entries[i]
+    if (e.charAt(0) !== '.' || dotOk) {
+      var m
+      // a negated pattern inverts the test, but only when there is no prefix
+      if (negate && !prefix) {
+        m = !e.match(pn)
+      } else {
+        m = e.match(pn)
+      }
+      if (m)
+        matchedEntries.push(e)
+    }
+  }
+
+  //console.error('prd2', prefix, entries, remain[0]._glob, matchedEntries)
+
+  var len = matchedEntries.length
+  // If there are no matched entries, then nothing matches.
+  if (len === 0)
+    return cb()
+
+  // if this is the last remaining pattern bit, then no need for
+  // an additional stat *unless* the user has specified mark or
+  // stat explicitly.  We know they exist, since readdir returned
+  // them.
+
+  if (remain.length === 1 && !this.mark && !this.stat) {
+    if (!this.matches[index])
+      this.matches[index] = Object.create(null)
+
+    for (var i = 0; i < len; i ++) {
+      var e = matchedEntries[i]
+      if (prefix) {
+        // avoid a double slash when the prefix is the root
+        if (prefix !== '/')
+          e = prefix + '/' + e
+        else
+          e = prefix + e
+      }
+
+      // rooted results are placed under this.root unless nomount is set
+      if (e.charAt(0) === '/' && !this.nomount) {
+        e = path.join(this.root, e)
+      }
+      this._emitMatch(index, e)
+    }
+    // This was the last one, and no stats were needed
+    return cb()
+  }
+
+  // now test all matched entries as stand-ins for that part
+  // of the pattern.
+  remain.shift()
+  for (var i = 0; i < len; i ++) {
+    var e = matchedEntries[i]
+    var newPattern
+    if (prefix) {
+      if (prefix !== '/')
+        e = prefix + '/' + e
+      else
+        e = prefix + e
+    }
+    this._process([e].concat(remain), index, inGlobStar, cb)
+  }
+  cb()
+}
+
+Glob.prototype._emitMatch = function (index, e) {
+  if (this.aborted)
+    return
+
+  if (this.matches[index][e])
+    return
+
+  if (isIgnored(this, e))
+    return
+
+  if (this.paused) {
+    this._emitQueue.push([index, e])
+    return
+  }
+
+  var abs = this._makeAbs(e)
+
+  if (this.nodir) {
+    var c = this.cache[abs]
+    if (c === 'DIR' || Array.isArray(c))
+      return
+  }
+
+  if (this.mark)
+    e = this._mark(e)
+
+  this.matches[index][e] = true
+
+  var st = this.statCache[abs]
+  if (st)
+    this.emit('stat', e, st)
+
+  this.emit('match', e)
+}
+
+Glob.prototype._readdirInGlobStar = function (abs, cb) {
+  if (this.aborted)
+    return
+  // readdir variant used inside `**`: lstat first so this.symlinks records whether abs is a symlink.
+  // follow all symlinked directories forever
+  // just proceed as if this is a non-globstar situation
+  if (this.follow)
+    return this._readdir(abs, false, cb)
+
+  var lstatkey = 'lstat\0' + abs
+  var self = this
+  var lstatcb = inflight(lstatkey, lstatcb_) // null when an lstat for this key is already pending
+
+  if (lstatcb)
+    fs.lstat(abs, lstatcb)
+
+  function lstatcb_ (er, lstat) {
+    if (er)
+      return cb() // lstat failed: report no entries
+
+    var isSym = lstat.isSymbolicLink()
+    self.symlinks[abs] = isSym
+
+    // If it's not a symlink or a dir, then it's definitely a regular file.
+    // don't bother doing a readdir in that case.
+    if (!isSym && !lstat.isDirectory()) {
+      self.cache[abs] = 'FILE'
+      cb()
+    } else
+      self._readdir(abs, false, cb)
+  }
+}
+
+Glob.prototype._readdir = function (abs, inGlobStar, cb) {
+  if (this.aborted)
+    return
+
+  cb = inflight('readdir\0'+abs+'\0'+inGlobStar, cb)
+  if (!cb)
+    return
+
+  //console.error('RD %j %j', +inGlobStar, abs)
+  if (inGlobStar && !ownProp(this.symlinks, abs))
+    return this._readdirInGlobStar(abs, cb)
+
+  if (ownProp(this.cache, abs)) {
+    var c = this.cache[abs]
+    if (!c || c === 'FILE')
+      return cb()
+
+    if (Array.isArray(c))
+      return cb(null, c)
+  }
+
+  var self = this
+  fs.readdir(abs, readdirCb(this, abs, cb))
+}
+
+function readdirCb (self, abs, cb) {
+  return function (er, entries) {
+    if (er)
+      self._readdirError(abs, er, cb)
+    else
+      self._readdirEntries(abs, entries, cb)
+  }
+}
+
+Glob.prototype._readdirEntries = function (abs, entries, cb) {
+  if (this.aborted)
+    return
+
+  // if we haven't asked to stat everything, then just
+  // assume that everything in there exists, so we can avoid
+  // having to stat it a second time.
+  if (!this.mark && !this.stat) {
+    for (var i = 0; i < entries.length; i ++) {
+      var e = entries[i]
+      if (abs === '/')
+        e = abs + e
+      else
+        e = abs + '/' + e
+      this.cache[e] = true
+    }
+  }
+
+  this.cache[abs] = entries
+  return cb(null, entries)
+}
+
+Glob.prototype._readdirError = function (f, er, cb) {
+  if (this.aborted)
+    return
+
+  // handle errors, and cache the information
+  switch (er.code) {
+    case 'ENOTSUP': // https://github.com/isaacs/node-glob/issues/205
+    case 'ENOTDIR': // totally normal. means it *does* exist.
+      this.cache[this._makeAbs(f)] = 'FILE'
+      break
+
+    case 'ENOENT': // not terribly unusual
+    case 'ELOOP':
+    case 'ENAMETOOLONG':
+    case 'UNKNOWN':
+      this.cache[this._makeAbs(f)] = false
+      break
+
+    default: // some unusual error.  Treat as failure.
+      this.cache[this._makeAbs(f)] = false
+      if (this.strict) {
+        this.emit('error', er)
+        // If the error is handled, then we abort
+        // if not, we threw out of here
+        this.abort()
+      }
+      if (!this.silent)
+        console.error('glob error', er)
+      break
+  }
+
+  return cb()
+}
+
+Glob.prototype._processGlobStar = function (prefix, read, abs, remain, index, inGlobStar, cb) {
+  var self = this
+  this._readdir(abs, inGlobStar, function (er, entries) {
+    self._processGlobStar2(prefix, read, abs, remain, index, inGlobStar, entries, cb)
+  })
+}
+
+
+Glob.prototype._processGlobStar2 = function (prefix, read, abs, remain, index, inGlobStar, entries, cb) {
+  // Expand the globstar at remain[0] using `entries`, the readdir result for `abs`.
+
+  // no entries means not a dir, so it can never have matches
+  // foo.txt/** doesn't match foo.txt
+  if (!entries)
+    return cb()
+
+  // test without the globstar, and with every child both below
+  // and replacing the globstar.
+  var remainWithoutGlobStar = remain.slice(1)
+  var gspref = prefix ? [ prefix ] : []
+  var noGlobStar = gspref.concat(remainWithoutGlobStar)
+
+  // the noGlobStar pattern exits the inGlobStar state
+  this._process(noGlobStar, index, false, cb)
+
+  var isSym = this.symlinks[abs]
+  var len = entries.length
+
+  // If it's a symlink, and we're in a globstar, then stop
+  if (isSym && inGlobStar)
+    return cb()
+
+  for (var i = 0; i < len; i++) {
+    var e = entries[i]
+    if (e.charAt(0) === '.' && !this.dot) // dot entries are skipped unless `dot` is set
+      continue
+
+    // these two cases enter the inGlobStar state
+    var instead = gspref.concat(entries[i], remainWithoutGlobStar) // the child replaces the globstar
+    this._process(instead, index, true, cb)
+
+    var below = gspref.concat(entries[i], remain) // the globstar is kept and continues below the child
+    this._process(below, index, true, cb)
+  }
+
+  cb()
+}
+
+Glob.prototype._processSimple = function (prefix, index, cb) {
+  // XXX review this.  Shouldn't it be doing the mounting etc
+  // before doing stat?  kinda weird?
+  var self = this
+  this._stat(prefix, function (er, exists) {
+    self._processSimple2(prefix, index, er, exists, cb)
+  })
+}
+Glob.prototype._processSimple2 = function (prefix, index, er, exists, cb) {
+
+  //console.error('ps2', prefix, exists)
+
+  if (!this.matches[index])
+    this.matches[index] = Object.create(null)
+
+  // If it doesn't exist, then just mark the lack of results
+  if (!exists)
+    return cb()
+
+  if (prefix && isAbsolute(prefix) && !this.nomount) {
+    var trail = /[\/\\]$/.test(prefix)
+    if (prefix.charAt(0) === '/') {
+      prefix = path.join(this.root, prefix)
+    } else {
+      prefix = path.resolve(this.root, prefix)
+      if (trail)
+        prefix += '/'
+    }
+  }
+
+  if (process.platform === 'win32')
+    prefix = prefix.replace(/\\/g, '/')
+
+  // Mark this as a match
+  this._emitMatch(index, prefix)
+  cb()
+}
+
+// Classify `f`: cb gets (null, 'DIR' | 'FILE' | false[, stat]); cb() with no arguments means f is too long, failed to stat, or is a file where a directory is required
+Glob.prototype._stat = function (f, cb) {
+  var abs = this._makeAbs(f)
+  var needDir = f.slice(-1) === '/' // a trailing slash means only a directory will do
+
+  if (f.length > this.maxLength)
+    return cb()
+
+  if (!this.stat && ownProp(this.cache, abs)) {
+    var c = this.cache[abs]
+
+    if (Array.isArray(c)) // a cached entry list means it was read as a directory
+      c = 'DIR'
+
+    // It exists, but maybe not how we need it
+    if (!needDir || c === 'DIR')
+      return cb(null, c) // c may also be a bare `true` or `false` cache value here
+
+    if (needDir && c === 'FILE')
+      return cb()
+
+    // otherwise we have to stat, because maybe c=true
+    // if we know it exists, but not what it is.
+  }
+
+  var exists // declared but never used in this function
+  var stat = this.statCache[abs]
+  if (stat !== undefined) {
+    if (stat === false) // a previous stat of abs failed
+      return cb(null, stat)
+    else {
+      var type = stat.isDirectory() ? 'DIR' : 'FILE'
+      if (needDir && type === 'FILE')
+        return cb()
+      else
+        return cb(null, type, stat)
+    }
+  }
+
+  var self = this
+  var statcb = inflight('stat\0' + abs, lstatcb_) // null when a stat for this key is already pending
+  if (statcb)
+    fs.lstat(abs, statcb)
+
+  function lstatcb_ (er, lstat) {
+    if (lstat && lstat.isSymbolicLink()) {
+      // If it's a symlink, then treat it as the target, unless
+      // the target does not exist, then treat it as a file.
+      return fs.stat(abs, function (er, stat) {
+        if (er)
+          self._stat2(f, abs, null, lstat, cb) // broken link: fall back to the lstat result
+        else
+          self._stat2(f, abs, er, stat, cb)
+      })
+    } else {
+      self._stat2(f, abs, er, lstat, cb)
+    }
+  }
+}
+
+Glob.prototype._stat2 = function (f, abs, er, stat, cb) {
+  if (er) {
+    this.statCache[abs] = false
+    return cb()
+  }
+
+  var needDir = f.slice(-1) === '/'
+  this.statCache[abs] = stat
+
+  if (abs.slice(-1) === '/' && !stat.isDirectory())
+    return cb(null, false, stat)
+
+  var c = stat.isDirectory() ? 'DIR' : 'FILE'
+  this.cache[abs] = this.cache[abs] || c
+
+  if (needDir && c !== 'DIR')
+    return cb()
+
+  return cb(null, c, stat)
+}
+
+}).call(this,require('_process'))
+},{"./common.js":15,"./sync.js":17,"_process":24,"assert":9,"events":14,"fs":12,"inflight":18,"inherits":19,"minimatch":20,"once":21,"path":22,"path-is-absolute":23,"util":28}],17:[function(require,module,exports){
+(function (process){
+module.exports = globSync
+globSync.GlobSync = GlobSync
+
+var fs = require('fs')
+var minimatch = require('minimatch')
+var Minimatch = minimatch.Minimatch
+var Glob = require('./glob.js').Glob
+var util = require('util')
+var path = require('path')
+var assert = require('assert')
+var isAbsolute = require('path-is-absolute')
+var common = require('./common.js')
+var alphasort = common.alphasort
+var alphasorti = common.alphasorti
+var setopts = common.setopts
+var ownProp = common.ownProp
+var childrenIgnored = common.childrenIgnored
+
+function globSync (pattern, options) {
+  if (typeof options === 'function' || arguments.length === 3)
+    throw new TypeError('callback provided to sync glob\n'+
+                        'See: https://github.com/isaacs/node-glob/issues/167')
+
+  return new GlobSync(pattern, options).found
+}
+
+function GlobSync (pattern, options) {
+  if (!pattern)
+    throw new Error('must provide pattern')
+
+  if (typeof options === 'function' || arguments.length === 3)
+    throw new TypeError('callback provided to sync glob\n'+
+                        'See: https://github.com/isaacs/node-glob/issues/167')
+
+  if (!(this instanceof GlobSync))
+    return new GlobSync(pattern, options)
+
+  setopts(this, pattern, options)
+
+  if (this.noprocess)
+    return this
+
+  var n = this.minimatch.set.length
+  this.matches = new Array(n)
+  for (var i = 0; i < n; i ++) {
+    this._process(this.minimatch.set[i], i, false)
+  }
+  this._finish()
+}
+
+GlobSync.prototype._finish = function () { // optional realpath pass over this.matches, then common.finish
+  assert(this instanceof GlobSync)
+  if (this.realpath) {
+    var self = this
+    this.matches.forEach(function (matchset, index) {
+      var set = self.matches[index] = Object.create(null) // rebuild this slot keyed by real paths
+      for (var p in matchset) {
+        try {
+          p = self._makeAbs(p)
+          var real = fs.realpathSync(p, self.realpathCache)
+          set[real] = true
+        } catch (er) {
+          if (er.syscall === 'stat') // keep the absolute path when only the stat step of realpath failed
+            set[self._makeAbs(p)] = true
+          else
+            throw er
+        }
+      }
+    })
+  }
+  common.finish(this)
+}
+
+
+GlobSync.prototype._process = function (pattern, index, inGlobStar) {
+  assert(this instanceof GlobSync)
+  // Walk one pattern (an array of literal strings and non-string matcher parts) for result slot `index`.
+  // Get the first [n] parts of pattern that are all strings.
+  var n = 0
+  while (typeof pattern[n] === 'string') {
+    n ++
+  }
+  // now n is the index of the first one that is *not* a string.
+
+  // See if there's anything else
+  var prefix
+  switch (n) {
+    // if not, then this is rather simple
+    case pattern.length:
+      this._processSimple(pattern.join('/'), index)
+      return
+
+    case 0:
+      // pattern *starts* with some non-trivial item.
+      // going to readdir(cwd), but not include the prefix in matches.
+      prefix = null
+      break
+
+    default:
+      // pattern has some string bits in the front.
+      // whatever it starts with, whether that's 'absolute' like /foo/bar,
+      // or 'relative' like '../baz'
+      prefix = pattern.slice(0, n).join('/')
+      break
+  }
+
+  var remain = pattern.slice(n) // everything from the first non-string part onwards
+
+  // get the list of entries.
+  var read
+  if (prefix === null)
+    read = '.'
+  else if (isAbsolute(prefix) || isAbsolute(pattern.join('/'))) {
+    if (!prefix || !isAbsolute(prefix)) // the joined pattern is absolute but the prefix lost its leading slash
+      prefix = '/' + prefix
+    read = prefix
+  } else
+    read = prefix
+
+  var abs = this._makeAbs(read)
+
+  //if ignored, skip processing
+  if (childrenIgnored(this, read))
+    return
+
+  var isGlobStar = remain[0] === minimatch.GLOBSTAR
+  if (isGlobStar)
+    this._processGlobStar(prefix, read, abs, remain, index, inGlobStar)
+  else
+    this._processReaddir(prefix, read, abs, remain, index, inGlobStar)
+}
+
+
+GlobSync.prototype._processReaddir = function (prefix, read, abs, remain, index, inGlobStar) {
+  var entries = this._readdir(abs, inGlobStar) // falsy when abs could not be listed
+
+  // if the abs isn't a dir, then nothing can match!
+  if (!entries)
+    return
+
+  // It will only match dot entries if it starts with a dot, or if
+  // dot is set.  Stuff like @(.foo|.bar) isn't allowed.
+  var pn = remain[0] // the pattern part every entry is tested against
+  var negate = !!this.minimatch.negate
+  var rawGlob = pn._glob
+  var dotOk = this.dot || rawGlob.charAt(0) === '.'
+
+  var matchedEntries = []
+  for (var i = 0; i < entries.length; i++) {
+    var e = entries[i]
+    if (e.charAt(0) !== '.' || dotOk) {
+      var m
+      if (negate && !prefix) { // negation is only honoured when there is no prefix
+        m = !e.match(pn)
+      } else {
+        m = e.match(pn)
+      }
+      if (m)
+        matchedEntries.push(e)
+    }
+  }
+
+  var len = matchedEntries.length
+  // If there are no matched entries, then nothing matches.
+  if (len === 0)
+    return
+
+  // if this is the last remaining pattern bit, then no need for
+  // an additional stat *unless* the user has specified mark or
+  // stat explicitly.  We know they exist, since readdir returned
+  // them.
+
+  if (remain.length === 1 && !this.mark && !this.stat) {
+    if (!this.matches[index])
+      this.matches[index] = Object.create(null)
+
+    for (var i = 0; i < len; i ++) {
+      var e = matchedEntries[i]
+      if (prefix) {
+        if (prefix.slice(-1) !== '/') // add a separator only when the prefix does not already end in one
+          e = prefix + '/' + e
+        else
+          e = prefix + e
+      }
+
+      if (e.charAt(0) === '/' && !this.nomount) {
+        e = path.join(this.root, e) // absolute matches are joined onto this.root unless nomount is set
+      }
+      this.matches[index][e] = true // written directly; _emitMatch is not used on this path
+    }
+    // This was the last one, and no stats were needed
+    return
+  }
+
+  // now test all matched entries as stand-ins for that part
+  // of the pattern.
+  remain.shift() // mutates `remain` in place: drops the part just matched
+  for (var i = 0; i < len; i ++) {
+    var e = matchedEntries[i]
+    var newPattern
+    if (prefix)
+      newPattern = [prefix, e]
+    else
+      newPattern = [e]
+    this._process(newPattern.concat(remain), index, inGlobStar)
+  }
+}
+
+
+GlobSync.prototype._emitMatch = function (index, e) {
+  var abs = this._makeAbs(e)
+  if (this.mark)
+    e = this._mark(e)
+
+  if (this.matches[index][e])
+    return
+
+  if (this.nodir) {
+    var c = this.cache[this._makeAbs(e)]
+    if (c === 'DIR' || Array.isArray(c))
+      return
+  }
+
+  this.matches[index][e] = true
+  if (this.stat)
+    this._stat(e)
+}
+
+
+GlobSync.prototype._readdirInGlobStar = function (abs) {
+  // follow all symlinked directories forever
+  // just proceed as if this is a non-globstar situation
+  if (this.follow)
+    return this._readdir(abs, false)
+
+  var entries
+  var lstat
+  var stat
+  try {
+    lstat = fs.lstatSync(abs)
+  } catch (er) {
+    // lstat failed, doesn't exist
+    return null
+  }
+
+  var isSym = lstat.isSymbolicLink()
+  this.symlinks[abs] = isSym
+
+  // If it's not a symlink or a dir, then it's definitely a regular file.
+  // don't bother doing a readdir in that case.
+  if (!isSym && !lstat.isDirectory())
+    this.cache[abs] = 'FILE'
+  else
+    entries = this._readdir(abs, false)
+
+  return entries
+}
+
+GlobSync.prototype._readdir = function (abs, inGlobStar) {
+  var entries
+
+  if (inGlobStar && !ownProp(this.symlinks, abs))
+    return this._readdirInGlobStar(abs)
+
+  if (ownProp(this.cache, abs)) {
+    var c = this.cache[abs]
+    if (!c || c === 'FILE')
+      return null
+
+    if (Array.isArray(c))
+      return c
+  }
+
+  try {
+    return this._readdirEntries(abs, fs.readdirSync(abs))
+  } catch (er) {
+    this._readdirError(abs, er)
+    return null
+  }
+}
+
+GlobSync.prototype._readdirEntries = function (abs, entries) {
+  // if we haven't asked to stat everything, then just
+  // assume that everything in there exists, so we can avoid
+  // having to stat it a second time.
+  if (!this.mark && !this.stat) {
+    for (var i = 0; i < entries.length; i ++) {
+      var e = entries[i]
+      if (abs === '/')
+        e = abs + e
+      else
+        e = abs + '/' + e
+      this.cache[e] = true
+    }
+  }
+
+  this.cache[abs] = entries
+
+  // mark and cache dir-ness
+  return entries
+}
+
+GlobSync.prototype._readdirError = function (f, er) {
+  // handle errors, and cache the information
+  switch (er.code) {
+    case 'ENOTSUP': // https://github.com/isaacs/node-glob/issues/205
+    case 'ENOTDIR': // totally normal. means it *does* exist.
+      this.cache[this._makeAbs(f)] = 'FILE'
+      break
+
+    case 'ENOENT': // not terribly unusual
+    case 'ELOOP':
+    case 'ENAMETOOLONG':
+    case 'UNKNOWN':
+      this.cache[this._makeAbs(f)] = false
+      break
+
+    default: // some unusual error.  Treat as failure.
+      this.cache[this._makeAbs(f)] = false
+      if (this.strict)
+        throw er
+      if (!this.silent)
+        console.error('glob error', er)
+      break
+  }
+}
+
+GlobSync.prototype._processGlobStar = function (prefix, read, abs, remain, index, inGlobStar) {
+  // Expand the globstar at remain[0] by recursing into _process for `abs` and each of its entries.
+  var entries = this._readdir(abs, inGlobStar)
+
+  // no entries means not a dir, so it can never have matches
+  // foo.txt/** doesn't match foo.txt
+  if (!entries)
+    return
+
+  // test without the globstar, and with every child both below
+  // and replacing the globstar.
+  var remainWithoutGlobStar = remain.slice(1)
+  var gspref = prefix ? [ prefix ] : []
+  var noGlobStar = gspref.concat(remainWithoutGlobStar)
+
+  // the noGlobStar pattern exits the inGlobStar state
+  this._process(noGlobStar, index, false)
+
+  var len = entries.length
+  var isSym = this.symlinks[abs]
+
+  // If it's a symlink, and we're in a globstar, then stop
+  if (isSym && inGlobStar)
+    return
+
+  for (var i = 0; i < len; i++) {
+    var e = entries[i]
+    if (e.charAt(0) === '.' && !this.dot) // dot entries are skipped unless `dot` is set
+      continue
+
+    // these two cases enter the inGlobStar state
+    var instead = gspref.concat(entries[i], remainWithoutGlobStar) // the child replaces the globstar
+    this._process(instead, index, true)
+
+    var below = gspref.concat(entries[i], remain) // the globstar is kept and continues below the child
+    this._process(below, index, true)
+  }
+}
+
+GlobSync.prototype._processSimple = function (prefix, index) {
+  // XXX review this.  Shouldn't it be doing the mounting etc
+  // before doing stat?  kinda weird?
+  var exists = this._stat(prefix)
+
+  if (!this.matches[index])
+    this.matches[index] = Object.create(null)
+
+  // If it doesn't exist, then just mark the lack of results
+  if (!exists)
+    return
+
+  if (prefix && isAbsolute(prefix) && !this.nomount) {
+    var trail = /[\/\\]$/.test(prefix)
+    if (prefix.charAt(0) === '/') {
+      prefix = path.join(this.root, prefix)
+    } else {
+      prefix = path.resolve(this.root, prefix)
+      if (trail)
+        prefix += '/'
+    }
+  }
+
+  if (process.platform === 'win32')
+    prefix = prefix.replace(/\\/g, '/')
+
+  // Mark this as a match
+  this.matches[index][prefix] = true
+}
+
+// Returns 'DIR', 'FILE', or false; a cached bare `true` can also be returned when no directory is required
+GlobSync.prototype._stat = function (f) {
+  var abs = this._makeAbs(f)
+  var needDir = f.slice(-1) === '/' // a trailing slash means only a directory will do
+
+  if (f.length > this.maxLength)
+    return false
+
+  if (!this.stat && ownProp(this.cache, abs)) {
+    var c = this.cache[abs]
+
+    if (Array.isArray(c)) // a cached entry list means it was read as a directory
+      c = 'DIR'
+
+    // It exists, but maybe not how we need it
+    if (!needDir || c === 'DIR')
+      return c
+
+    if (needDir && c === 'FILE')
+      return false
+
+    // otherwise we have to stat, because maybe c=true
+    // if we know it exists, but not what it is.
+  }
+
+  var exists // declared but never used in this function
+  var stat = this.statCache[abs]
+  if (!stat) {
+    var lstat
+    try {
+      lstat = fs.lstatSync(abs)
+    } catch (er) {
+      return false // lstat failed; nothing is written to statCache on this path
+    }
+
+    if (lstat.isSymbolicLink()) {
+      try {
+        stat = fs.statSync(abs) // describe the link target when it can be stat-ed
+      } catch (er) {
+        stat = lstat // broken link: fall back to the lstat result
+      }
+    } else {
+      stat = lstat
+    }
+  }
+
+  this.statCache[abs] = stat
+
+  var c = stat.isDirectory() ? 'DIR' : 'FILE'
+  this.cache[abs] = this.cache[abs] || c // an existing truthy cache value is kept
+
+  if (needDir && c !== 'DIR')
+    return false
+
+  return c
+}
+
+GlobSync.prototype._mark = function (p) { // delegates to common.mark with this instance
+  return common.mark(this, p)
+}
+
+GlobSync.prototype._makeAbs = function (f) { // delegates to common.makeAbs with this instance
+  return common.makeAbs(this, f)
+}
+
+}).call(this,require('_process'))
+},{"./common.js":15,"./glob.js":16,"_process":24,"assert":9,"fs":12,"minimatch":20,"path":22,"path-is-absolute":23,"util":28}],18:[function(require,module,exports){
+(function (process){
+var wrappy = require('wrappy')
+var reqs = Object.create(null) // pending callback lists per key, shared by inflight() and makeres()
+var once = require('once')
+
+module.exports = wrappy(inflight)
+
+function inflight (key, cb) {
+  if (reqs[key]) {
+    reqs[key].push(cb)
+    return null
+  } else {
+    reqs[key] = [cb]
+    return makeres(key)
+  }
+}
+
+function makeres (key) { // builds the function that fans one result out to every callback queued under key
+  return once(function RES () {
+    var cbs = reqs[key]
+    var len = cbs.length // snapshot: only callbacks queued so far run in this pass
+    var args = slice(arguments)
+
+    // XXX It's somewhat ambiguous whether a new callback added in this
+    // pass should be queued for later execution if something in the
+    // list of callbacks throws, or if it should just be discarded.
+    // However, it's such an edge case that it hardly matters, and either
+    // choice is likely as surprising as the other.
+    // As it happens, we do go ahead and schedule it for later execution.
+    try {
+      for (var i = 0; i < len; i++) {
+        cbs[i].apply(null, args)
+      }
+    } finally {
+      if (cbs.length > len) {
+        // added more in the interim.
+        // de-zalgo, just in case, but don't call again.
+        cbs.splice(0, len) // drop the callbacks that just ran; late arrivals stay queued
+        process.nextTick(function () {
+          RES.apply(null, args) // calls the inner RES directly, with the same arguments
+        })
+      } else {
+        delete reqs[key] // nothing new arrived: the key is free for a fresh request
+      }
+    }
+  })
+}
+
+function slice (args) {
+  var length = args.length
+  var array = []
+
+  for (var i = 0; i < length; i++) array[i] = args[i]
+  return array
+}
+
+}).call(this,require('_process'))
+},{"_process":24,"once":21,"wrappy":29}],19:[function(require,module,exports){
+if (typeof Object.create === 'function') {
+  // implementation from standard node.js 'util' module
+  module.exports = function inherits(ctor, superCtor) {
+    ctor.super_ = superCtor // keep a reference to the parent constructor
+    ctor.prototype = Object.create(superCtor.prototype, {
+      constructor: { // `constructor` points back at ctor and is non-enumerable
+        value: ctor,
+        enumerable: false,
+        writable: true,
+        configurable: true
+      }
+    });
+  };
+} else {
+  // old school shim for old browsers
+  module.exports = function inherits(ctor, superCtor) {
+    ctor.super_ = superCtor
+    var TempCtor = function () {} // empty constructor: links the chain without running superCtor
+    TempCtor.prototype = superCtor.prototype
+    ctor.prototype = new TempCtor()
+    ctor.prototype.constructor = ctor
+  }
+}
+
+},{}],20:[function(require,module,exports){
+module.exports = minimatch
+minimatch.Minimatch = Minimatch
+
+var path = { sep: '/' } // fallback used when require('path') throws
+try {
+  path = require('path')
+} catch (er) {}
+
+var GLOBSTAR = minimatch.GLOBSTAR = Minimatch.GLOBSTAR = {} // unique sentinel object that parse() returns for a '**' part
+var expand = require('brace-expansion')
+
+var plTypes = { // regex source opened/closed around an extglob list, keyed by the list's leading character
+  '!': { open: '(?:(?!(?:', close: '))[^/]*?)'},
+  '?': { open: '(?:', close: ')?' },
+  '+': { open: '(?:', close: ')+' },
+  '*': { open: '(?:', close: ')*' },
+  '@': { open: '(?:', close: ')' }
+}
+
+// ? => any single character other than /
+// don't need to escape / when using new RegExp()
+var qmark = '[^/]'
+
+// * => any number of characters other than / (lazy)
+var star = qmark + '*?'
+
+// ** when dots are allowed.  Anything goes, except .. and .
+// not (^ or / followed by one or two dots followed by $ or /),
+// followed by anything, any number of times.
+var twoStarDot = '(?:(?!(?:\\\/|^)(?:\\.{1,2})($|\\\/)).)*?'
+
+// ** when dots are not allowed:
+// not a ^ or / followed by a dot, followed by anything, any number of times.
+var twoStarNoDot = '(?:(?!(?:\\\/|^)\\.).)*?'
+
+// characters that need to be escaped in RegExp.
+var reSpecials = charSet('().*{}+?[]^$\\!') // lookup object: character -> true
+
+// "abc" -> { a:true, b:true, c:true }
+function charSet (s) {
+  return s.split('').reduce(function (set, c) {
+    set[c] = true
+    return set
+  }, {})
+}
+
+// normalizes slashes: splitting on this treats a run of '/' as a single separator.
+var slashSplit = /\/+/
+
+minimatch.filter = filter
+function filter (pattern, options) {
+  options = options || {}
+  return function (p, i, list) {
+    return minimatch(p, pattern, options)
+  }
+}
+
+function ext (a, b) {
+  a = a || {}
+  b = b || {}
+  var t = {}
+  Object.keys(b).forEach(function (k) {
+    t[k] = b[k]
+  })
+  Object.keys(a).forEach(function (k) {
+    t[k] = a[k]
+  })
+  return t
+}
+
+minimatch.defaults = function (def) {
+  if (!def || !Object.keys(def).length) return minimatch
+
+  var orig = minimatch
+
+  var m = function minimatch (p, pattern, options) {
+    return orig.minimatch(p, pattern, ext(def, options))
+  }
+
+  m.Minimatch = function Minimatch (pattern, options) {
+    return new orig.Minimatch(pattern, ext(def, options))
+  }
+
+  return m
+}
+
+Minimatch.defaults = function (def) {
+  if (!def || !Object.keys(def).length) return Minimatch
+  return minimatch.defaults(def).Minimatch
+}
+
+function minimatch (p, pattern, options) {
+  if (typeof pattern !== 'string') {
+    throw new TypeError('glob pattern string required')
+  }
+
+  if (!options) options = {}
+
+  // shortcut: comments match nothing.
+  if (!options.nocomment && pattern.charAt(0) === '#') {
+    return false
+  }
+
+  // "" only matches ""
+  if (pattern.trim() === '') return p === ''
+
+  return new Minimatch(pattern, options).match(p)
+}
+
+function Minimatch (pattern, options) {
+  if (!(this instanceof Minimatch)) {
+    return new Minimatch(pattern, options)
+  }
+
+  if (typeof pattern !== 'string') {
+    throw new TypeError('glob pattern string required')
+  }
+
+  if (!options) options = {}
+  pattern = pattern.trim()
+
+  // windows support: need to use /, not \
+  if (path.sep !== '/') {
+    pattern = pattern.split(path.sep).join('/')
+  }
+
+  this.options = options
+  this.set = []
+  this.pattern = pattern
+  this.regexp = null
+  this.negate = false
+  this.comment = false
+  this.empty = false
+
+  // make the set of regexps etc.
+  this.make()
+}
+
+Minimatch.prototype.debug = function () {} // no-op by default; make() swaps in console.error when options.debug is set
+
+Minimatch.prototype.make = make
+function make () { // compiles this.pattern into this.set (one array of parsed parts per brace-expanded pattern)
+  // don't do it more than once.
+  if (this._made) return // NOTE(review): nothing in the visible code ever sets _made - confirm
+
+  var pattern = this.pattern
+  var options = this.options
+
+  // empty patterns and comments match nothing.
+  if (!options.nocomment && pattern.charAt(0) === '#') {
+    this.comment = true
+    return
+  }
+  if (!pattern) {
+    this.empty = true
+    return
+  }
+
+  // step 1: figure out negation, etc.
+  this.parseNegate()
+
+  // step 2: expand braces
+  var set = this.globSet = this.braceExpand()
+
+  if (options.debug) this.debug = console.error
+
+  this.debug(this.pattern, set)
+
+  // step 3: now we have a set, so turn each one into a series of path-portion
+  // matching patterns.
+  // These will be regexps, except in the case of "**", which is
+  // set to the GLOBSTAR object for globstar behavior,
+  // and will not contain any / characters
+  set = this.globParts = set.map(function (s) {
+    return s.split(slashSplit)
+  })
+
+  this.debug(this.pattern, set)
+
+  // glob --> regexps
+  set = set.map(function (s, si, set) {
+    return s.map(this.parse, this)
+  }, this)
+
+  this.debug(this.pattern, set)
+
+  // filter out everything that didn't compile properly.
+  set = set.filter(function (s) {
+    return s.indexOf(false) === -1 // parse() returns false for a part it rejects
+  })
+
+  this.debug(this.pattern, set)
+
+  this.set = set
+}
+
+Minimatch.prototype.parseNegate = parseNegate
+function parseNegate () {
+  var pattern = this.pattern
+  var negate = false
+  var options = this.options
+  var negateOffset = 0
+
+  if (options.nonegate) return
+
+  for (var i = 0, l = pattern.length
+    ; i < l && pattern.charAt(i) === '!'
+    ; i++) {
+    negate = !negate
+    negateOffset++
+  }
+
+  if (negateOffset) this.pattern = pattern.substr(negateOffset)
+  this.negate = negate
+}
+
+// Brace expansion:
+// a{b,c}d -> abd acd
+// a{b,}c -> abc ac
+// a{0..3}d -> a0d a1d a2d a3d
+// a{b,c{d,e}f}g -> abg acdfg acefg
+// a{b,c}d{e,f}g -> abdeg abdfg acdeg acdfg
+//
+// Invalid sets are not expanded.
+// a{2..}b -> a{2..}b
+// a{b}c -> a{b}c
+minimatch.braceExpand = function (pattern, options) { // static entry point: forwards to the shared braceExpand function
+  return braceExpand(pattern, options)
+}
+
+Minimatch.prototype.braceExpand = braceExpand
+
+function braceExpand (pattern, options) {
+  if (!options) {
+    if (this instanceof Minimatch) {
+      options = this.options
+    } else {
+      options = {}
+    }
+  }
+
+  pattern = typeof pattern === 'undefined'
+    ? this.pattern : pattern
+
+  if (typeof pattern === 'undefined') {
+    throw new TypeError('undefined pattern')
+  }
+
+  if (options.nobrace ||
+    !pattern.match(/\{.*\}/)) {
+    // shortcut. no need to expand.
+    return [pattern]
+  }
+
+  return expand(pattern)
+}
+
+// parse a component of the expanded set.
+// At this point, no pattern may contain "/" in it
+// so we're going to return a 2d array, where each entry is the full
+// pattern, split on '/', and then turned into a regular expression.
+// A regexp is made at the end which joins each array with an
+// escaped /, and another full one which joins each regexp with |.
+//
+// Following the lead of Bash 4.1, note that "**" only has special meaning
+// when it is the *only* thing in a path portion.  Otherwise, any series
+// of * is equivalent to a single *.  Globstar behavior is enabled by
+// default, and can be disabled by setting options.noglobstar.
+Minimatch.prototype.parse = parse
+var SUBPARSE = {}
+function parse (pattern, isSub) {
+  if (pattern.length > 1024 * 64) {
+    throw new TypeError('pattern is too long')
+  }
+
+  var options = this.options
+
+  // shortcuts
+  if (!options.noglobstar && pattern === '**') return GLOBSTAR
+  if (pattern === '') return ''
+
+  var re = ''
+  var hasMagic = !!options.nocase
+  var escaping = false
+  // ? => one single character
+  var patternListStack = []
+  var negativeLists = []
+  var stateChar
+  var inClass = false
+  var reClassStart = -1
+  var classStart = -1
+  // . and .. never match anything that doesn't start with .,
+  // even when options.dot is set.
+  var patternStart = pattern.charAt(0) === '.' ? '' // anything
+  // not (start or / followed by . or .. followed by / or end)
+  : options.dot ? '(?!(?:^|\\\/)\\.{1,2}(?:$|\\\/))'
+  : '(?!\\.)'
+  var self = this
+
+  function clearStateChar () {
+    if (stateChar) {
+      // we had some state-tracking character
+      // that wasn't consumed by this pass.
+      switch (stateChar) {
+        case '*':
+          re += star
+          hasMagic = true
+        break
+        case '?':
+          re += qmark
+          hasMagic = true
+        break
+        default:
+          re += '\\' + stateChar
+        break
+      }
+      self.debug('clearStateChar %j %j', stateChar, re)
+      stateChar = false
+    }
+  }
+
+  for (var i = 0, len = pattern.length, c
+    ; (i < len) && (c = pattern.charAt(i))
+    ; i++) {
+    this.debug('%s\t%s %s %j', pattern, i, re, c)
+
+    // skip over any that are escaped.
+    if (escaping && reSpecials[c]) {
+      re += '\\' + c
+      escaping = false
+      continue
+    }
+
+    switch (c) {
+      case '/':
+        // completely not allowed, even escaped.
+        // Should already be path-split by now.
+        return false
+
+      case '\\':
+        clearStateChar()
+        escaping = true
+      continue
+
+      // the various stateChar values
+      // for the "extglob" stuff.
+      case '?':
+      case '*':
+      case '+':
+      case '@':
+      case '!':
+        this.debug('%s\t%s %s %j <-- stateChar', pattern, i, re, c)
+
+        // all of those are literals inside a class, except that
+        // the glob [!a] means [^a] in regexp
+        if (inClass) {
+          this.debug('  in class')
+          if (c === '!' && i === classStart + 1) c = '^'
+          re += c
+          continue
+        }
+
+        // if we already have a stateChar, then it means
+        // that there was something like ** or +? in there.
+        // Handle the stateChar, then proceed with this one.
+        self.debug('call clearStateChar %j', stateChar)
+        clearStateChar()
+        stateChar = c
+        // if extglob is disabled, then +(asdf|foo) isn't a thing.
+        // just clear the statechar *now*, rather than even diving into
+        // the patternList stuff.
+        if (options.noext) clearStateChar()
+      continue
+
+      case '(':
+        if (inClass) {
+          re += '('
+          continue
+        }
+
+        if (!stateChar) {
+          re += '\\('
+          continue
+        }
+
+        patternListStack.push({
+          type: stateChar,
+          start: i - 1,
+          reStart: re.length,
+          open: plTypes[stateChar].open,
+          close: plTypes[stateChar].close
+        })
+        // negation is (?:(?!js)[^/]*)
+        re += stateChar === '!' ? '(?:(?!(?:' : '(?:'
+        this.debug('plType %j %j', stateChar, re)
+        stateChar = false
+      continue
+
+      case ')':
+        if (inClass || !patternListStack.length) {
+          re += '\\)'
+          continue
+        }
+
+        clearStateChar()
+        hasMagic = true
+        var pl = patternListStack.pop()
+        // negation is (?:(?!js)[^/]*)
+        // The others are (?:<pattern>)<type>
+        re += pl.close
+        if (pl.type === '!') {
+          negativeLists.push(pl)
+        }
+        pl.reEnd = re.length
+      continue
+
+      case '|':
+        if (inClass || !patternListStack.length || escaping) {
+          re += '\\|'
+          escaping = false
+          continue
+        }
+
+        clearStateChar()
+        re += '|'
+      continue
+
+      // these are mostly the same in regexp and glob
+      case '[':
+        // swallow any state-tracking char before the [
+        clearStateChar()
+
+        if (inClass) {
+          re += '\\' + c
+          continue
+        }
+
+        inClass = true
+        classStart = i
+        reClassStart = re.length
+        re += c
+      continue
+
+      case ']':
+        //  a right bracket shall lose its special
+        //  meaning and represent itself in
+        //  a bracket expression if it occurs
+        //  first in the list.  -- POSIX.2 2.8.3.2
+        if (i === classStart + 1 || !inClass) {
+          re += '\\' + c
+          escaping = false
+          continue
+        }
+
+        // handle the case where we left a class open.
+        // "[z-a]" is valid, equivalent to "\[z-a\]"
+        if (inClass) {
+          // split where the last [ was, make sure we don't have
+          // an invalid re. if so, re-walk the contents of the
+          // would-be class to re-translate any characters that
+          // were passed through as-is
+          // TODO: It would probably be faster to determine this
+          // without a try/catch and a new RegExp, but it's tricky
+          // to do safely.  For now, this is safe and works.
+          var cs = pattern.substring(classStart + 1, i)
+          try {
+            RegExp('[' + cs + ']')
+          } catch (er) {
+            // not a valid class!
+            var sp = this.parse(cs, SUBPARSE)
+            re = re.substr(0, reClassStart) + '\\[' + sp[0] + '\\]'
+            hasMagic = hasMagic || sp[1]
+            inClass = false
+            continue
+          }
+        }
+
+        // finish up the class.
+        hasMagic = true
+        inClass = false
+        re += c
+      continue
+
+      default:
+        // swallow any state char that wasn't consumed
+        clearStateChar()
+
+        if (escaping) {
+          // no need
+          escaping = false
+        } else if (reSpecials[c]
+          && !(c === '^' && inClass)) {
+          re += '\\'
+        }
+
+        re += c
+
+    } // switch
+  } // for
+
+  // handle the case where we left a class open.
+  // "[abc" is valid, equivalent to "\[abc"
+  if (inClass) {
+    // split where the last [ was, and escape it
+    // this is a huge pita.  We now have to re-walk
+    // the contents of the would-be class to re-translate
+    // any characters that were passed through as-is
+    cs = pattern.substr(classStart + 1)
+    sp = this.parse(cs, SUBPARSE)
+    re = re.substr(0, reClassStart) + '\\[' + sp[0]
+    hasMagic = hasMagic || sp[1]
+  }
+
+  // handle the case where we had a +( thing at the *end*
+  // of the pattern.
+  // each pattern list stack adds 3 chars, and we need to go through
+  // and escape any | chars that were passed through as-is for the regexp.
+  // Go through and escape them, taking care not to double-escape any
+  // | chars that were already escaped.
+  for (pl = patternListStack.pop(); pl; pl = patternListStack.pop()) {
+    var tail = re.slice(pl.reStart + pl.open.length)
+    this.debug('setting tail', re, pl)
+    // maybe some even number of \, then maybe 1 \, followed by a |
+    tail = tail.replace(/((?:\\{2}){0,64})(\\?)\|/g, function (_, $1, $2) {
+      if (!$2) {
+        // the | isn't already escaped, so escape it.
+        $2 = '\\'
+      }
+
+      // need to escape all those slashes *again*, without escaping the
+      // one that we need for escaping the | character.  As it works out,
+      // escaping an even number of slashes can be done by simply repeating
+      // it exactly after itself.  That's why this trick works.
+      //
+      // I am sorry that you have to see this.
+      return $1 + $1 + $2 + '|'
+    })
+
+    this.debug('tail=%j\n   %s', tail, tail, pl, re)
+    var t = pl.type === '*' ? star
+      : pl.type === '?' ? qmark
+      : '\\' + pl.type
+
+    hasMagic = true
+    re = re.slice(0, pl.reStart) + t + '\\(' + tail
+  }
+
+  // handle trailing things that only matter at the very end.
+  clearStateChar()
+  if (escaping) {
+    // trailing \\
+    re += '\\\\'
+  }
+
+  // only need to apply the nodot start if the re starts with
+  // something that could conceivably capture a dot
+  var addPatternStart = false
+  switch (re.charAt(0)) {
+    case '.':
+    case '[':
+    case '(': addPatternStart = true
+  }
+
+  // Hack to work around lack of negative lookbehind in JS
+  // A pattern like: *.!(x).!(y|z) needs to ensure that a name
+  // like 'a.xyz.yz' doesn't match.  So, the first negative
+  // lookahead, has to look ALL the way ahead, to the end of
+  // the pattern.
+  for (var n = negativeLists.length - 1; n > -1; n--) {
+    var nl = negativeLists[n]
+
+    var nlBefore = re.slice(0, nl.reStart)
+    var nlFirst = re.slice(nl.reStart, nl.reEnd - 8)
+    var nlLast = re.slice(nl.reEnd - 8, nl.reEnd)
+    var nlAfter = re.slice(nl.reEnd)
+
+    nlLast += nlAfter
+
+    // Handle nested stuff like *(*.js|!(*.json)), where open parens
+    // mean that we should *not* include the ) in the bit that is considered
+    // "after" the negated section.
+    var openParensBefore = nlBefore.split('(').length - 1
+    var cleanAfter = nlAfter
+    for (i = 0; i < openParensBefore; i++) {
+      cleanAfter = cleanAfter.replace(/\)[+*?]?/, '')
+    }
+    nlAfter = cleanAfter
+
+    var dollar = ''
+    if (nlAfter === '' && isSub !== SUBPARSE) {
+      dollar = '$'
+    }
+    var newRe = nlBefore + nlFirst + nlAfter + dollar + nlLast
+    re = newRe
+  }
+
+  // if the re is not "" at this point, then we need to make sure
+  // it doesn't match against an empty path part.
+  // Otherwise a/* will match a/, which it should not.
+  if (re !== '' && hasMagic) {
+    re = '(?=.)' + re
+  }
+
+  if (addPatternStart) {
+    re = patternStart + re
+  }
+
+  // parsing just a piece of a larger pattern.
+  if (isSub === SUBPARSE) {
+    return [re, hasMagic]
+  }
+
+  // skip the regexp for non-magical patterns
+  // unescape anything in it, though, so that it'll be
+  // an exact match against a file etc.
+  if (!hasMagic) {
+    return globUnescape(pattern)
+  }
+
+  var flags = options.nocase ? 'i' : ''
+  try {
+    var regExp = new RegExp('^' + re + '$', flags)
+  } catch (er) {
+    // If it was an invalid regular expression, then it can't match
+    // anything.  This trick looks for a character after the end of
+    // the string, which is of course impossible, except in multi-line
+    // mode, but it's not a /m regex.
+    return new RegExp('$.')
+  }
+
+  regExp._glob = pattern
+  regExp._src = re
+
+  return regExp
+}
+
+minimatch.makeRe = function (pattern, options) {
+  return new Minimatch(pattern, options || {}).makeRe()
+}
+
+Minimatch.prototype.makeRe = makeRe
+function makeRe () {
+  if (this.regexp || this.regexp === false) return this.regexp
+
+  // at this point, this.set is a 2d array of partial
+  // pattern strings, or "**".
+  //
+  // It's better to use .match().  This function shouldn't
+  // be used, really, but it's pretty convenient sometimes,
+  // when you just want to work with a regex.
+  var set = this.set
+
+  if (!set.length) {
+    this.regexp = false
+    return this.regexp
+  }
+  var options = this.options
+
+  var twoStar = options.noglobstar ? star
+    : options.dot ? twoStarDot
+    : twoStarNoDot
+  var flags = options.nocase ? 'i' : ''
+
+  var re = set.map(function (pattern) {
+    return pattern.map(function (p) {
+      return (p === GLOBSTAR) ? twoStar
+      : (typeof p === 'string') ? regExpEscape(p)
+      : p._src
+    }).join('\\\/')
+  }).join('|')
+
+  // must match entire pattern
+  // ending in a * or ** will make it less strict.
+  re = '^(?:' + re + ')$'
+
+  // can match anything, as long as it's not this.
+  if (this.negate) re = '^(?!' + re + ').*$'
+
+  try {
+    this.regexp = new RegExp(re, flags)
+  } catch (ex) {
+    this.regexp = false
+  }
+  return this.regexp
+}
+
+minimatch.match = function (list, pattern, options) {
+  options = options || {}
+  var mm = new Minimatch(pattern, options)
+  list = list.filter(function (f) {
+    return mm.match(f)
+  })
+  if (mm.options.nonull && !list.length) {
+    list.push(pattern)
+  }
+  return list
+}
+
+Minimatch.prototype.match = match
+function match (f, partial) {
+  this.debug('match', f, this.pattern)
+  // short-circuit in the case of busted things.
+  // comments, etc.
+  if (this.comment) return false
+  if (this.empty) return f === ''
+
+  if (f === '/' && partial) return true
+
+  var options = this.options
+
+  // windows: need to use /, not \
+  if (path.sep !== '/') {
+    f = f.split(path.sep).join('/')
+  }
+
+  // treat the test path as a set of pathparts.
+  f = f.split(slashSplit)
+  this.debug(this.pattern, 'split', f)
+
+  // just ONE of the pattern sets in this.set needs to match
+  // in order for it to be valid.  If negating, then just one
+  // match means that we have failed.
+  // Either way, return on the first hit.
+
+  var set = this.set
+  this.debug(this.pattern, 'set', set)
+
+  // Find the basename of the path by looking for the last non-empty segment
+  var filename
+  var i
+  for (i = f.length - 1; i >= 0; i--) {
+    filename = f[i]
+    if (filename) break
+  }
+
+  for (i = 0; i < set.length; i++) {
+    var pattern = set[i]
+    var file = f
+    if (options.matchBase && pattern.length === 1) {
+      file = [filename]
+    }
+    var hit = this.matchOne(file, pattern, partial)
+    if (hit) {
+      if (options.flipNegate) return true
+      return !this.negate
+    }
+  }
+
+  // didn't get any hits.  this is success if it's a negative
+  // pattern, failure otherwise.
+  if (options.flipNegate) return false
+  return this.negate
+}
+
+// Matches the path parts in `file` against one parsed `pattern` (an array of
+// strings, regexps and GLOBSTAR markers); returns a truthy value on a match.
+// Set partial to true to test if, for example, "/a/b" matches the start of
+// "/*/b/*/d": running out of file before running out of pattern is then
+// fine, as long as all the parts seen so far match.
+Minimatch.prototype.matchOne = function (file, pattern, partial) {
+  var options = this.options
+
+  this.debug('matchOne',
+    { 'this': this, file: file, pattern: pattern })
+
+  this.debug('matchOne', file.length, pattern.length)
+
+  for (var fi = 0,
+      pi = 0,
+      fl = file.length,
+      pl = pattern.length
+      ; (fi < fl) && (pi < pl)
+      ; fi++, pi++) {
+    this.debug('matchOne loop')
+    var p = pattern[pi]
+    var f = file[fi]
+
+    this.debug(pattern, p, f)
+
+    // should be impossible.
+    // some invalid regexp stuff in the set.
+    if (p === false) return false
+
+    if (p === GLOBSTAR) {
+      this.debug('GLOBSTAR', [pattern, p, f])
+
+      // "**"
+      // a/**/b/**/c would match the following:
+      // a/b/x/y/z/c
+      // a/x/y/z/b/c
+      // a/b/x/b/x/c
+      // a/b/c
+      // To do this, take the rest of the pattern after
+      // the **, and see if it would match the file remainder.
+      // If so, return success.
+      // If not, the ** "swallows" a segment, and try again.
+      // This is recursively awful.
+      //
+      // a/**/b/**/c matching a/b/x/y/z/c
+      // - a matches a
+      // - doublestar
+      //   - matchOne(b/x/y/z/c, b/**/c)
+      //     - b matches b
+      //     - doublestar
+      //       - matchOne(x/y/z/c, c) -> no
+      //       - matchOne(y/z/c, c) -> no
+      //       - matchOne(z/c, c) -> no
+      //       - matchOne(c, c) yes, hit
+      var fr = fi
+      var pr = pi + 1
+      if (pr === pl) {
+        this.debug('** at the end')
+        // a ** at the end will just swallow the rest.
+        // We have found a match.
+        // however, it will not swallow /.x, unless
+        // options.dot is set.
+        // . and .. are *never* matched by **, for explosively
+        // exponential reasons.
+        for (; fi < fl; fi++) {
+          if (file[fi] === '.' || file[fi] === '..' ||
+            (!options.dot && file[fi].charAt(0) === '.')) return false
+        }
+        return true
+      }
+
+      // ok, let's see if we can swallow whatever we can.
+      while (fr < fl) {
+        var swallowee = file[fr]
+
+        this.debug('\nglobstar while', file, fr, pattern, pr, swallowee)
+
+        // XXX remove this slice.  Just pass the start index.
+        if (this.matchOne(file.slice(fr), pattern.slice(pr), partial)) {
+          this.debug('globstar found match!', fr, fl, swallowee)
+          // found a match.
+          return true
+        } else {
+          // can't swallow "." or ".." ever.
+          // can only swallow ".foo" when explicitly asked.
+          if (swallowee === '.' || swallowee === '..' ||
+            (!options.dot && swallowee.charAt(0) === '.')) {
+            this.debug('dot detected!', file, fr, pattern, pr)
+            break
+          }
+
+          // ** swallows a segment, and continue.
+          this.debug('globstar swallow a segment, and continue')
+          fr++
+        }
+      }
+
+      // no match was found.
+      // However, in partial mode, we can't say this is necessarily over.
+      // If the whole file was swallowed (fr === fl), a longer path may still match.
+      if (partial) {
+        // ran out of file
+        this.debug('\n>>> no match, partial?', file, fr, pattern, pr)
+        if (fr === fl) return true
+      }
+      return false
+    }
+
+    // something other than **
+    // non-magic patterns just have to match exactly
+    // patterns with magic have been turned into regexps.
+    var hit
+    if (typeof p === 'string') {
+      if (options.nocase) {
+        hit = f.toLowerCase() === p.toLowerCase()
+      } else {
+        hit = f === p
+      }
+      this.debug('string match', p, f, hit)
+    } else {
+      hit = f.match(p)
+      this.debug('pattern match', p, f, hit)
+    }
+
+    if (!hit) return false
+  }
+
+  // Note: ending in / means that we'll get a final ""
+  // at the end of the pattern.  This can only match a
+  // corresponding "" at the end of the file.
+  // If the file ends in /, then it can only match a
+  // pattern that ends in /, unless the pattern just
+  // doesn't have any more for it. But, a/b/ should *not*
+  // match "a/b/*", even though "" matches against the
+  // [^/]*? pattern, except in partial mode, where it might
+  // simply not be reached yet.
+  // However, a/b/ should still satisfy a/*
+
+  // now either we fell off the end of the pattern, or we're done.
+  if (fi === fl && pi === pl) {
+    // ran out of pattern and filename at the same time.
+    // an exact hit!
+    return true
+  } else if (fi === fl) {
+    // ran out of file, but still had pattern left.
+    // this is ok if we're doing the match as part of
+    // a glob fs traversal.
+    return partial
+  } else if (pi === pl) {
+    // ran out of pattern, still have file left.
+    // this is only acceptable if we're on the very last
+    // empty segment of a file with a trailing slash.
+    // a/* should match a/b/
+    var emptyFileEnd = (fi === fl - 1) && (file[fi] === '')
+    return emptyFileEnd
+  }
+
+  // should be unreachable.
+  throw new Error('wtf?')
+}
+
+// replace stuff like \* with *
+function globUnescape (s) {
+  return s.replace(/\\(.)/g, '$1')
+}
+
+function regExpEscape (s) {
+  return s.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, '\\$&')
+}
+
+},{"brace-expansion":11,"path":22}],21:[function(require,module,exports){
+var wrappy = require('wrappy')
+module.exports = wrappy(once)
+module.exports.strict = wrappy(onceStrict)
+
+once.proto = once(function () {
+  Object.defineProperty(Function.prototype, 'once', {
+    value: function () {
+      return once(this)
+    },
+    configurable: true
+  })
+
+  Object.defineProperty(Function.prototype, 'onceStrict', {
+    value: function () {
+      return onceStrict(this)
+    },
+    configurable: true
+  })
+})
+
+function once (fn) {
+  var f = function () {
+    if (f.called) return f.value
+    f.called = true
+    return f.value = fn.apply(this, arguments)
+  }
+  f.called = false
+  return f
+}
+
+function onceStrict (fn) {
+  var f = function () {
+    if (f.called)
+      throw new Error(f.onceError)
+    f.called = true
+    return f.value = fn.apply(this, arguments)
+  }
+  var name = fn.name || 'Function wrapped with `once`'
+  f.onceError = name + " shouldn't be called more than once"
+  f.called = false
+  return f
+}
+
+},{"wrappy":29}],22:[function(require,module,exports){
+(function (process){
+// Copyright Joyent, Inc. and other Node contributors.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to permit
+// persons to whom the Software is furnished to do so, subject to the
+// following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
+// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+// USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// resolves . and .. elements in a path array with directory names there
+// must be no slashes, empty elements, or device names (c:\) in the array
+// (so also no leading and trailing slashes - it does not distinguish
+// relative and absolute paths)
+function normalizeArray(parts, allowAboveRoot) {
+  // if the path tries to go above the root, `up` ends up > 0
+  var up = 0;
+  for (var i = parts.length - 1; i >= 0; i--) {
+    var last = parts[i];
+    if (last === '.') {
+      parts.splice(i, 1);
+    } else if (last === '..') {
+      parts.splice(i, 1);
+      up++;
+    } else if (up) {
+      parts.splice(i, 1);
+      up--;
+    }
+  }
+
+  // if the path is allowed to go above the root, restore leading ..s
+  if (allowAboveRoot) {
+    for (; up--; up) {
+      parts.unshift('..');
+    }
+  }
+
+  return parts;
+}
+
+// Split a filename into [root, dir, basename, ext], unix version
+// 'root' is just a slash, or nothing.
+var splitPathRe =
+    /^(\/?|)([\s\S]*?)((?:\.{1,2}|[^\/]+?|)(\.[^.\/]*|))(?:[\/]*)$/;
+var splitPath = function(filename) {
+  return splitPathRe.exec(filename).slice(1);
+};
+
+// path.resolve([from ...], to)
+// posix version
+exports.resolve = function() {
+  var resolvedPath = '',
+      resolvedAbsolute = false;
+
+  for (var i = arguments.length - 1; i >= -1 && !resolvedAbsolute; i--) {
+    var path = (i >= 0) ? arguments[i] : process.cwd();
+
+    // Skip empty and invalid entries
+    if (typeof path !== 'string') {
+      throw new TypeError('Arguments to path.resolve must be strings');
+    } else if (!path) {
+      continue;
+    }
+
+    resolvedPath = path + '/' + resolvedPath;
+    resolvedAbsolute = path.charAt(0) === '/';
+  }
+
+  // At this point the path should be resolved to a full absolute path, but
+  // handle relative paths to be safe (might happen when process.cwd() fails)
+
+  // Normalize the path
+  resolvedPath = normalizeArray(filter(resolvedPath.split('/'), function(p) {
+    return !!p;
+  }), !resolvedAbsolute).join('/');
+
+  return ((resolvedAbsolute ? '/' : '') + resolvedPath) || '.';
+};
+
+// path.normalize(path)
+// posix version
+exports.normalize = function(path) {
+  var isAbsolute = exports.isAbsolute(path),
+      trailingSlash = substr(path, -1) === '/';
+
+  // Normalize the path
+  path = normalizeArray(filter(path.split('/'), function(p) {
+    return !!p;
+  }), !isAbsolute).join('/');
+
+  if (!path && !isAbsolute) {
+    path = '.';
+  }
+  if (path && trailingSlash) {
+    path += '/';
+  }
+
+  return (isAbsolute ? '/' : '') + path;
+};
+
+// posix version
+exports.isAbsolute = function(path) {
+  return path.charAt(0) === '/';
+};
+
+// posix version
+exports.join = function() {
+  var paths = Array.prototype.slice.call(arguments, 0);
+  return exports.normalize(filter(paths, function(p, index) {
+    if (typeof p !== 'string') {
+      throw new TypeError('Arguments to path.join must be strings');
+    }
+    return p;
+  }).join('/'));
+};
+
+
+// path.relative(from, to)
+// posix version
+exports.relative = function(from, to) {
+  from = exports.resolve(from).substr(1);
+  to = exports.resolve(to).substr(1);
+
+  function trim(arr) {
+    var start = 0;
+    for (; start < arr.length; start++) {
+      if (arr[start] !== '') break;
+    }
+
+    var end = arr.length - 1;
+    for (; end >= 0; end--) {
+      if (arr[end] !== '') break;
+    }
+
+    if (start > end) return [];
+    return arr.slice(start, end - start + 1);
+  }
+
+  var fromParts = trim(from.split('/'));
+  var toParts = trim(to.split('/'));
+
+  var length = Math.min(fromParts.length, toParts.length);
+  var samePartsLength = length;
+  for (var i = 0; i < length; i++) {
+    if (fromParts[i] !== toParts[i]) {
+      samePartsLength = i;
+      break;
+    }
+  }
+
+  var outputParts = [];
+  for (var i = samePartsLength; i < fromParts.length; i++) {
+    outputParts.push('..');
+  }
+
+  outputParts = outputParts.concat(toParts.slice(samePartsLength));
+
+  return outputParts.join('/');
+};
+
+exports.sep = '/'; // the segment separator the posix functions above split and join on
+exports.delimiter = ':'; // NOTE(review): presumably the path-list delimiter, as in node's path.delimiter - confirm
+
+exports.dirname = function(path) {
+  var result = splitPath(path),
+      root = result[0],
+      dir = result[1];
+
+  if (!root && !dir) {
+    // No dirname whatsoever
+    return '.';
+  }
+
+  if (dir) {
+    // It has a dirname, strip trailing slash
+    dir = dir.substr(0, dir.length - 1);
+  }
+
+  return root + dir;
+};
+
+
+exports.basename = function(path, ext) {
+  var f = splitPath(path)[2];
+  // TODO: make this comparison case-insensitive on windows?
+  if (ext && f.substr(-1 * ext.length) === ext) {
+    f = f.substr(0, f.length - ext.length);
+  }
+  return f;
+};
+
+
+exports.extname = function(path) {
+  return splitPath(path)[3];
+};
+
+function filter (xs, f) {
+    if (xs.filter) return xs.filter(f);
+    var res = [];
+    for (var i = 0; i < xs.length; i++) {
+        if (f(xs[i], i, xs)) res.push(xs[i]);
+    }
+    return res;
+}
+
+// String.prototype.substr - negative index don't work in IE8
+var substr = 'ab'.substr(-1) === 'b'
+    ? function (str, start, len) { return str.substr(start, len) }
+    : function (str, start, len) {
+        if (start < 0) start = str.length + start;
+        return str.substr(start, len);
+    }
+;
+
+}).call(this,require('_process'))
+},{"_process":24}],23:[function(require,module,exports){
+(function (process){
+'use strict';
+
+function posix(path) {
+	return path.charAt(0) === '/';
+}
+
+function win32(path) {
+	// https://github.com/nodejs/node/blob/b3fcc245fb25539909ef1d5eaa01dbf92e168633/lib/path.js#L56
+	var splitDeviceRe = /^([a-zA-Z]:|[\\\/]{2}[^\\\/]+[\\\/]+[^\\\/]+)?([\\\/])?([\s\S]*?)$/;
+	var result = splitDeviceRe.exec(path);
+	var device = result[1] || '';
+	var isUnc = Boolean(device && device.charAt(1) !== ':');
+
+	// UNC paths are always absolute
+	return Boolean(result[2] || isUnc);
+}
+
+module.exports = process.platform === 'win32' ? win32 : posix; // default export follows process.platform
+module.exports.posix = posix; // both variants stay reachable by name
+module.exports.win32 = win32;
+
+}).call(this,require('_process'))
+},{"_process":24}],24:[function(require,module,exports){
+// shim for using process in browser
+var process = module.exports = {};
+
+// cached from whatever global is present so that test runners that stub it
+// don't break things.  But we need to wrap it in a try catch in case it is
+// wrapped in strict mode code which doesn't define any globals.  It's inside a
+// function because try/catches deoptimize in certain engines.
+
+var cachedSetTimeout;
+var cachedClearTimeout;
+
+// Stand-ins that fail loudly when the environment exposes no timers.
+function defaultSetTimout() {
+    throw new Error('setTimeout has not been defined');
+}
+function defaultClearTimeout () {
+    throw new Error('clearTimeout has not been defined');
+}
+
+// Resolve both cached timer functions once, at load time.  Each probe has its
+// own try/catch so a throwing lookup falls back to the matching stand-in.
+(function () {
+    try {
+        cachedSetTimeout = typeof setTimeout === 'function'
+            ? setTimeout
+            : defaultSetTimout;
+    } catch (e) {
+        cachedSetTimeout = defaultSetTimout;
+    }
+    try {
+        cachedClearTimeout = typeof clearTimeout === 'function'
+            ? clearTimeout
+            : defaultClearTimeout;
+    } catch (e) {
+        cachedClearTimeout = defaultClearTimeout;
+    }
+} ())
+// Calls the cached setTimeout with (fun, 0) and returns its result, retrying
+// with explicit `this` values when the plain call throws.
+function runTimeout(fun) {
+    if (cachedSetTimeout === setTimeout) {
+        // normal environments in sane situations
+        return setTimeout(fun, 0);
+    }
+    // if setTimeout wasn't available when this module loaded but was defined later
+    if ((cachedSetTimeout === defaultSetTimout || !cachedSetTimeout) && setTimeout) {
+        cachedSetTimeout = setTimeout;
+        return setTimeout(fun, 0);
+    }
+    try {
+        // when somebody has screwed with setTimeout but no I.E. madness
+        return cachedSetTimeout(fun, 0);
+    } catch(e){
+        try {
+            // When we are in I.E. but the script has been evaled so I.E. doesn't trust the global object when called normally
+            return cachedSetTimeout.call(null, fun, 0);
+        } catch(e){
+            // same as above but when it's a version of I.E. that must have the global object for 'this', hopefully our context is correct, otherwise it will throw a global error
+            return cachedSetTimeout.call(this, fun, 0);
+        }
+    }
+}
+// Calls the cached clearTimeout with `marker` (the value a runTimeout call
+// returned) and returns its result, using the same chain of fallbacks as
+// runTimeout when the plain call throws.
+function runClearTimeout(marker) {
+    if (cachedClearTimeout === clearTimeout) {
+        // normal environments in sane situations
+        return clearTimeout(marker);
+    }
+    // if clearTimeout wasn't available when this module loaded but was defined later
+    if ((cachedClearTimeout === defaultClearTimeout || !cachedClearTimeout) && clearTimeout) {
+        cachedClearTimeout = clearTimeout;
+        return clearTimeout(marker);
+    }
+    try {
+        // when somebody has screwed with clearTimeout but no I.E. madness
+        return cachedClearTimeout(marker);
+    } catch (e){
+        try {
+            // When we are in I.E. but the script has been evaled so I.E. doesn't trust the global object when called normally
+            return cachedClearTimeout.call(null, marker);
+        } catch (e){
+            // same as above but when it's a version of I.E. that must have the global object for 'this', hopefully our context is correct, otherwise it will throw a global error.
+            // Some versions of I.E. have different rules for clearTimeout vs setTimeout
+            return cachedClearTimeout.call(this, marker);
+        }
+    }
+}
+var queue = [];
+var draining = false;
+var currentQueue;
+var queueIndex = -1;
+
+function cleanUpNextTick() {
+    if (!draining || !currentQueue) {
+        return;
+    }
+    draining = false;
+    if (currentQueue.length) {
+        queue = currentQueue.concat(queue);
+    } else {
+        queueIndex = -1;
+    }
+    if (queue.length) {
+        drainQueue();
+    }
+}
+
+// Runs every queued Item, batch by batch, including items queued while draining.
+function drainQueue() {
+    if (draining) {
+        return;
+    }
+    var timeout = runTimeout(cleanUpNextTick); // cancelled below once the drain completes
+    draining = true;
+    var len = queue.length;
+    while(len) {
+        currentQueue = queue; // take the current batch ...
+        queue = []; // ... so that items queued meanwhile form the next one
+        while (++queueIndex < len) {
+            if (currentQueue) {
+                currentQueue[queueIndex].run();
+            }
+        }
+        queueIndex = -1;
+        len = queue.length; // non-zero if the batch queued more work
+    }
+    currentQueue = null;
+    draining = false;
+    runClearTimeout(timeout);
+}
+
+process.nextTick = function (fun) {
+    var args = new Array(arguments.length - 1);
+    if (arguments.length > 1) {
+        for (var i = 1; i < arguments.length; i++) {
+            args[i - 1] = arguments[i];
+        }
+    }
+    queue.push(new Item(fun, args));
+    if (queue.length === 1 && !draining) {
+        runTimeout(drainQueue);
+    }
+};
+
+// v8 likes predictable objects: each queued call is stored as an Item
+function Item(fun, array) {
+    this.fun = fun; // function to call
+    this.array = array; // arguments it is applied with
+}
+Item.prototype.run = function () {
+    this.fun.apply(null, this.array); // `this` inside fun is passed as null
+};
+process.title = 'browser'; // fixed placeholder values for the browser shim
+process.browser = true;
+process.env = {};
+process.argv = [];
+process.version = ''; // empty string to avoid regexp issues
+process.versions = {};
+
+function noop() {}
+
+process.on = noop; // every event-emitter method below is a no-op
+process.addListener = noop;
+process.once = noop;
+process.off = noop;
+process.removeListener = noop;
+process.removeAllListeners = noop;
+process.emit = noop;
+process.prependListener = noop;
+process.prependOnceListener = noop;
+
+process.listeners = function (name) { return [] } // always an empty list, since nothing is ever registered
+
+process.binding = function (name) {
+    throw new Error('process.binding is not supported');
+};
+
+process.cwd = function () { return '/' }; // the shim always reports the root as working directory
+process.chdir = function (dir) {
+    throw new Error('process.chdir is not supported');
+};
+process.umask = function() { return 0; };
+
+},{}],25:[function(require,module,exports){
+//     Underscore.js 1.8.3
+//     http://underscorejs.org
+//     (c) 2009-2015 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
+//     Underscore may be freely distributed under the MIT license.
+
+(function() {
+
+  // Baseline setup
+  // --------------
+
+  // Establish the root object, `window` in the browser, or `exports` on the server.
+  var root = this;
+
+  // Save the previous value of the `_` variable.
+  var previousUnderscore = root._;
+
+  // Save bytes in the minified (but not gzipped) version:
+  var ArrayProto = Array.prototype, ObjProto = Object.prototype, FuncProto = Function.prototype;
+
+  // Create quick reference variables for speed access to core prototypes.
+  var
+    push             = ArrayProto.push,
+    slice            = ArrayProto.slice,
+    toString         = ObjProto.toString,
+    hasOwnProperty   = ObjProto.hasOwnProperty;
+
+  // All **ECMAScript 5** native function implementations that we hope to use
+  // are declared here.
+  var
+    nativeIsArray      = Array.isArray,
+    nativeKeys         = Object.keys,
+    nativeBind         = FuncProto.bind,
+    nativeCreate       = Object.create;
+
+  // Naked function reference for surrogate-prototype-swapping.
+  var Ctor = function(){};
+
+  // Create a safe reference to the Underscore object for use below.
+  var _ = function(obj) {
+    if (obj instanceof _) return obj; // already a wrapper: hand it back unchanged
+    if (!(this instanceof _)) return new _(obj); // called without `new`: re-enter as a constructor
+    this._wrapped = obj; // keep the raw value on the wrapper instance
+  };
+
+  // Export the Underscore object for **Node.js**, with
+  // backwards-compatibility for the old `require()` API. If we're in
+  // the browser, add `_` as a global object.
+  if (typeof exports !== 'undefined') {
+    if (typeof module !== 'undefined' && module.exports) {
+      exports = module.exports = _;
+    }
+    exports._ = _;
+  } else {
+    root._ = _;
+  }
+
+  // Current version.
+  _.VERSION = '1.8.3';
+
+  // Internal function that returns an efficient (for current engines) version
+  // of the passed-in callback, to be repeatedly applied in other Underscore
+  // functions.
+  var optimizeCb = function(func, context, argCount) {
+    if (context === void 0) return func;
+    switch (argCount == null ? 3 : argCount) {
+      case 1: return function(value) {
+        return func.call(context, value);
+      };
+      case 2: return function(value, other) {
+        return func.call(context, value, other);
+      };
+      case 3: return function(value, index, collection) {
+        return func.call(context, value, index, collection);
+      };
+      case 4: return function(accumulator, value, index, collection) {
+        return func.call(context, accumulator, value, index, collection);
+      };
+    }
+    return function() {
+      return func.apply(context, arguments);
+    };
+  };
+
+  // A mostly-internal function to generate callbacks that can be applied
+  // to each element in a collection, returning the desired result — either
+  // identity, an arbitrary callback, a property matcher, or a property accessor.
+  var cb = function(value, context, argCount) {
+    if (value == null) return _.identity;
+    if (_.isFunction(value)) return optimizeCb(value, context, argCount);
+    if (_.isObject(value)) return _.matcher(value);
+    return _.property(value);
+  };
+  _.iteratee = function(value, context) {
+    return cb(value, context, Infinity); // Infinity matches none of optimizeCb's fixed-arity cases, so a context-bound function forwards all arguments
+  };
+
+  // An internal function for creating assigner functions.
+  var createAssigner = function(keysFunc, undefinedOnly) {
+    return function(obj) {
+      var length = arguments.length;
+      if (length < 2 || obj == null) return obj;
+      for (var index = 1; index < length; index++) {
+        var source = arguments[index],
+            keys = keysFunc(source),
+            l = keys.length;
+        for (var i = 0; i < l; i++) {
+          var key = keys[i];
+          if (!undefinedOnly || obj[key] === void 0) obj[key] = source[key];
+        }
+      }
+      return obj;
+    };
+  };
+
+  // An internal function for creating a new object that inherits from another.
+  var baseCreate = function(prototype) {
+    if (!_.isObject(prototype)) return {}; // not an object: fall back to a plain empty object
+    if (nativeCreate) return nativeCreate(prototype); // use native Object.create when present
+    Ctor.prototype = prototype; // otherwise borrow the shared empty constructor
+    var result = new Ctor;
+    Ctor.prototype = null; // reset so the shared constructor stops referencing `prototype`
+    return result;
+  };
+
+  var property = function(key) {
+    return function(obj) {
+      return obj == null ? void 0 : obj[key]; // null/undefined objects yield undefined instead of throwing
+    };
+  };
+
+  // Helper for collection methods to determine whether a collection
+  // should be iterated as an array or as an object
+  // Related: http://people.mozilla.org/~jorendorff/es6-draft.html#sec-tolength
+  // Avoids a very nasty iOS 8 JIT bug on ARM-64. #2094
+  var MAX_ARRAY_INDEX = Math.pow(2, 53) - 1;
+  var getLength = property('length');
+  var isArrayLike = function(collection) {
+    var length = getLength(collection);
+    return typeof length == 'number' && length >= 0 && length <= MAX_ARRAY_INDEX;
+  };
+
+  // Collection Functions
+  // --------------------
+
+  // The cornerstone, an `each` implementation, aka `forEach`.
+  // Handles raw objects in addition to array-likes. Treats all
+  // sparse array-likes as if they were dense.
+  _.each = _.forEach = function(obj, iteratee, context) {
+    iteratee = optimizeCb(iteratee, context);
+    var i, length;
+    if (isArrayLike(obj)) {
+      for (i = 0, length = obj.length; i < length; i++) {
+        iteratee(obj[i], i, obj);
+      }
+    } else {
+      var keys = _.keys(obj);
+      for (i = 0, length = keys.length; i < length; i++) {
+        iteratee(obj[keys[i]], keys[i], obj);
+      }
+    }
+    return obj;
+  };
+
+  // Return the results of applying the iteratee to each element.
+  _.map = _.collect = function(obj, iteratee, context) {
+    iteratee = cb(iteratee, context);
+    var keys = !isArrayLike(obj) && _.keys(obj),
+        length = (keys || obj).length,
+        results = Array(length);
+    for (var index = 0; index < length; index++) {
+      var currentKey = keys ? keys[index] : index;
+      results[index] = iteratee(obj[currentKey], currentKey, obj);
+    }
+    return results;
+  };
+
+  // Create a reducing function iterating left or right.
+  function createReduce(dir) {
+    // Optimized iterator function, as using arguments.length
+    // in the main function will deoptimize it, see #1991.
+    function iterator(obj, iteratee, memo, keys, index, length) {
+      for (; index >= 0 && index < length; index += dir) { // walk in the direction given by dir
+        var currentKey = keys ? keys[index] : index;
+        memo = iteratee(memo, obj[currentKey], currentKey, obj);
+      }
+      return memo;
+    }
+
+    return function(obj, iteratee, memo, context) {
+      iteratee = optimizeCb(iteratee, context, 4);
+      var keys = !isArrayLike(obj) && _.keys(obj),
+          length = (keys || obj).length,
+          index = dir > 0 ? 0 : length - 1; // start at the first element for dir > 0, otherwise at the last
+      // Determine the initial value if none is provided.
+      if (arguments.length < 3) {
+        memo = obj[keys ? keys[index] : index];
+        index += dir; // the element used as the initial value is not visited again
+      }
+      return iterator(obj, iteratee, memo, keys, index, length);
+    };
+  }
+
+  // **Reduce** builds up a single result from a list of values, aka `inject`,
+  // or `foldl`.
+  _.reduce = _.foldl = _.inject = createReduce(1);
+
+  // The right-associative version of reduce, also known as `foldr`.
+  _.reduceRight = _.foldr = createReduce(-1);
+
+  // Return the first value which passes a truth test. Aliased as `detect`.
+  _.find = _.detect = function(obj, predicate, context) {
+    var key;
+    if (isArrayLike(obj)) {
+      key = _.findIndex(obj, predicate, context);
+    } else {
+      key = _.findKey(obj, predicate, context);
+    }
+    if (key !== void 0 && key !== -1) return obj[key];
+  };
+
+  // Return all the elements that pass a truth test.
+  // Aliased as `select`.
+  _.filter = _.select = function(obj, predicate, context) {
+    var results = [];
+    predicate = cb(predicate, context);
+    _.each(obj, function(value, index, list) {
+      if (predicate(value, index, list)) results.push(value);
+    });
+    return results;
+  };
+
+  // Return all the elements for which a truth test fails.
+  _.reject = function(obj, predicate, context) {
+    return _.filter(obj, _.negate(cb(predicate)), context); // filter with the negated predicate; `context` is applied by _.filter
+  };
+
+  // Determine whether all of the elements match a truth test.
+  // Aliased as `all`.
+  _.every = _.all = function(obj, predicate, context) {
+    predicate = cb(predicate, context);
+    var keys = !isArrayLike(obj) && _.keys(obj),
+        length = (keys || obj).length;
+    for (var index = 0; index < length; index++) {
+      var currentKey = keys ? keys[index] : index;
+      if (!predicate(obj[currentKey], currentKey, obj)) return false;
+    }
+    return true;
+  };
+
+  // Determine if at least one element in the object matches a truth test.
+  // Aliased as `any`.
+  _.some = _.any = function(obj, predicate, context) {
+    predicate = cb(predicate, context);
+    var keys = !isArrayLike(obj) && _.keys(obj),
+        length = (keys || obj).length;
+    for (var index = 0; index < length; index++) {
+      var currentKey = keys ? keys[index] : index;
+      if (predicate(obj[currentKey], currentKey, obj)) return true;
+    }
+    return false;
+  };
+
+  // Determine if the array or object contains a given item (using `===`).
+  // Aliased as `includes` and `include`.
+  _.contains = _.includes = _.include = function(obj, item, fromIndex, guard) {
+    if (!isArrayLike(obj)) obj = _.values(obj);
+    if (typeof fromIndex != 'number' || guard) fromIndex = 0;
+    return _.indexOf(obj, item, fromIndex) >= 0;
+  };
+
+  // Invoke a method (with arguments) on every item in a collection.
+  _.invoke = function(obj, method) {
+    var args = slice.call(arguments, 2);
+    var isFunc = _.isFunction(method);
+    return _.map(obj, function(value) {
+      var func = isFunc ? method : value[method];
+      return func == null ? func : func.apply(value, args);
+    });
+  };
+
+  // Convenience version of a common use case of `map`: fetching a property.
+  _.pluck = function(obj, key) {
+    return _.map(obj, _.property(key));
+  };
+
+  // Convenience version of a common use case of `filter`: selecting only objects
+  // containing specific `key:value` pairs.
+  _.where = function(obj, attrs) {
+    return _.filter(obj, _.matcher(attrs));
+  };
+
+  // Convenience version of a common use case of `find`: getting the first object
+  // containing specific `key:value` pairs.
+  _.findWhere = function(obj, attrs) {
+    return _.find(obj, _.matcher(attrs));
+  };
+
+  // Return the maximum element (or element-based computation).
+  _.max = function(obj, iteratee, context) {
+    var result = -Infinity, lastComputed = -Infinity,
+        value, computed;
+    if (iteratee == null && obj != null) {
+      obj = isArrayLike(obj) ? obj : _.values(obj);
+      for (var i = 0, length = obj.length; i < length; i++) {
+        value = obj[i];
+        if (value > result) {
+          result = value;
+        }
+      }
+    } else {
+      iteratee = cb(iteratee, context);
+      _.each(obj, function(value, index, list) {
+        computed = iteratee(value, index, list);
+        if (computed > lastComputed || computed === -Infinity && result === -Infinity) {
+          result = value;
+          lastComputed = computed;
+        }
+      });
+    }
+    return result;
+  };
+
+  // Return the minimum element (or element-based computation).
+  _.min = function(obj, iteratee, context) {
+    var result = Infinity, lastComputed = Infinity,
+        value, computed;
+    if (iteratee == null && obj != null) {
+      obj = isArrayLike(obj) ? obj : _.values(obj);
+      for (var i = 0, length = obj.length; i < length; i++) {
+        value = obj[i];
+        if (value < result) {
+          result = value;
+        }
+      }
+    } else {
+      iteratee = cb(iteratee, context);
+      _.each(obj, function(value, index, list) {
+        computed = iteratee(value, index, list);
+        if (computed < lastComputed || computed === Infinity && result === Infinity) {
+          result = value;
+          lastComputed = computed;
+        }
+      });
+    }
+    return result;
+  };
+
+  // Shuffle a collection, using the modern version of the
+  // [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher–Yates_shuffle).
+  _.shuffle = function(obj) {
+    var set = isArrayLike(obj) ? obj : _.values(obj); // non-array-likes are shuffled by their values
+    var length = set.length;
+    var shuffled = Array(length);
+    for (var index = 0, rand; index < length; index++) {
+      rand = _.random(0, index);
+      if (rand !== index) shuffled[index] = shuffled[rand]; // move the chosen slot's current occupant to the newest position
+      shuffled[rand] = set[index]; // place the next source element in the chosen slot
+    }
+    return shuffled; // a new array; `set` is only read
+  };
+
+  // Sample **n** random values from a collection.
+  // If **n** is not specified, returns a single random element.
+  // The internal `guard` argument allows it to work with `map`.
+  _.sample = function(obj, n, guard) {
+    if (n == null || guard) {
+      if (!isArrayLike(obj)) obj = _.values(obj);
+      return obj[_.random(obj.length - 1)];
+    }
+    return _.shuffle(obj).slice(0, Math.max(0, n));
+  };
+
+  // Sort the object's values by a criterion produced by an iteratee.
+  _.sortBy = function(obj, iteratee, context) {
+    iteratee = cb(iteratee, context);
+    return _.pluck(_.map(obj, function(value, index, list) {
+      return {
+        value: value,
+        index: index,
+        criteria: iteratee(value, index, list)
+      };
+    }).sort(function(left, right) {
+      var a = left.criteria;
+      var b = right.criteria;
+      if (a !== b) {
+        if (a > b || a === void 0) return 1;
+        if (a < b || b === void 0) return -1;
+      }
+      return left.index - right.index;
+    }), 'value');
+  };
+
+  // An internal function used for aggregate "group by" operations.
+  var group = function(behavior) {
+    return function(obj, iteratee, context) {
+      var result = {};
+      iteratee = cb(iteratee, context);
+      _.each(obj, function(value, index) {
+        var key = iteratee(value, index, obj);
+        behavior(result, value, key);
+      });
+      return result;
+    };
+  };
+
+  // Groups the object's values by a criterion. Pass either a string attribute
+  // to group by, or a function that returns the criterion.
+  _.groupBy = group(function(result, value, key) {
+    if (_.has(result, key)) result[key].push(value); else result[key] = [value];
+  });
+
+  // Indexes the object's values by a criterion, similar to `groupBy`, but for
+  // when you know that your index values will be unique.
+  _.indexBy = group(function(result, value, key) {
+    result[key] = value;
+  });
+
+  // Counts instances of an object that group by a certain criterion. Pass
+  // either a string attribute to count by, or a function that returns the
+  // criterion.
+  _.countBy = group(function(result, value, key) {
+    if (_.has(result, key)) result[key]++; else result[key] = 1;
+  });
+
+  // Safely create a real, live array from anything iterable.
+  _.toArray = function(obj) {
+    if (!obj) return [];
+    if (_.isArray(obj)) return slice.call(obj);
+    if (isArrayLike(obj)) return _.map(obj, _.identity);
+    return _.values(obj);
+  };
+
+  // Return the number of elements in an object.
+  _.size = function(obj) {
+    if (obj == null) return 0;
+    return isArrayLike(obj) ? obj.length : _.keys(obj).length;
+  };
+
+  // Split a collection into two arrays: one whose elements all satisfy the given
+  // predicate, and one whose elements all do not satisfy the predicate.
+  _.partition = function(obj, predicate, context) {
+    predicate = cb(predicate, context);
+    var pass = [], fail = [];
+    _.each(obj, function(value, key, obj) {
+      (predicate(value, key, obj) ? pass : fail).push(value);
+    });
+    return [pass, fail];
+  };
+
+  // Array Functions
+  // ---------------
+
+  // Get the first element of an array. Passing **n** will return the first N
+  // values in the array. Aliased as `head` and `take`. The **guard** check
+  // allows it to work with `_.map`.
+  _.first = _.head = _.take = function(array, n, guard) {
+    if (array == null) return void 0;
+    if (n == null || guard) return array[0];
+    return _.initial(array, array.length - n);
+  };
+
+  // Returns everything but the last entry of the array. Especially useful on
+  // the arguments object. Passing **n** will return all the values in
+  // the array, excluding the last N.
+  _.initial = function(array, n, guard) {
+    return slice.call(array, 0, Math.max(0, array.length - (n == null || guard ? 1 : n))); // drops 1 entry by default, or n when given without guard; the end index never goes below 0
+  };
+
+  // Get the last element of an array. Passing **n** will return the last N
+  // values in the array.
+  _.last = function(array, n, guard) {
+    if (array == null) return void 0;
+    if (n == null || guard) return array[array.length - 1];
+    return _.rest(array, Math.max(0, array.length - n));
+  };
+
+  // Returns everything but the first entry of the array. Aliased as `tail` and `drop`.
+  // Especially useful on the arguments object. Passing an **n** will return
+  // the rest N values in the array.
+  _.rest = _.tail = _.drop = function(array, n, guard) {
+    return slice.call(array, n == null || guard ? 1 : n); // start at index 1 by default, or at n when given without guard
+  };
+
+  // Trim out all falsy values from an array.
+  _.compact = function(array) {
+    return _.filter(array, _.identity);
+  };
+
+  // Internal implementation of a recursive `flatten` function.
+  var flatten = function(input, shallow, strict, startIndex) {
+    var output = [], idx = 0; // idx is the next write position in output
+    for (var i = startIndex || 0, length = getLength(input); i < length; i++) {
+      var value = input[i];
+      if (isArrayLike(value) && (_.isArray(value) || _.isArguments(value))) {
+        // Flatten the current level of an array or arguments object.
+        if (!shallow) value = flatten(value, shallow, strict); // deep mode: flatten the nested value first
+        var j = 0, len = value.length;
+        output.length += len; // grow output up front for the entries about to be copied
+        while (j < len) {
+          output[idx++] = value[j++];
+        }
+      } else if (!strict) { // strict mode drops values that are not arrays or arguments objects
+        output[idx++] = value;
+      }
+    }
+    return output;
+  };
+
+  // Flatten out an array, either recursively (by default), or just one level.
+  _.flatten = function(array, shallow) {
+    return flatten(array, shallow, false);
+  };
+
+  // Return a version of the array that does not contain the specified value(s).
+  _.without = function(array) {
+    return _.difference(array, slice.call(arguments, 1));
+  };
+
+  // Produce a duplicate-free version of the array. If the array has already
+  // been sorted, you have the option of using a faster algorithm.
+  // Aliased as `unique`.
+  _.uniq = _.unique = function(array, isSorted, iteratee, context) {
+    if (!_.isBoolean(isSorted)) { // isSorted omitted: shift the remaining arguments one place
+      context = iteratee;
+      iteratee = isSorted;
+      isSorted = false;
+    }
+    if (iteratee != null) iteratee = cb(iteratee, context);
+    var result = [];
+    var seen = []; // computed values already emitted (sorted mode reuses it for the previous computed value)
+    for (var i = 0, length = getLength(array); i < length; i++) {
+      var value = array[i],
+          computed = iteratee ? iteratee(value, i, array) : value;
+      if (isSorted) { // sorted input: comparing with the previous computed value is enough
+        if (!i || seen !== computed) result.push(value);
+        seen = computed;
+      } else if (iteratee) { // unsorted with iteratee: computed values are tracked separately from results
+        if (!_.contains(seen, computed)) {
+          seen.push(computed);
+          result.push(value);
+        }
+      } else if (!_.contains(result, value)) { // unsorted, no iteratee: the result list doubles as the seen list
+        result.push(value);
+      }
+    }
+    return result;
+  };
+
+  // Produce an array that contains the union: each distinct element from all of
+  // the passed-in arrays.
+  _.union = function() {
+    return _.uniq(flatten(arguments, true, true));
+  };
+
+  // Produce an array that contains every item shared between all the
+  // passed-in arrays.
+  _.intersection = function(array) {
+    var result = [];
+    var argsLength = arguments.length;
+    for (var i = 0, length = getLength(array); i < length; i++) {
+      var item = array[i];
+      if (_.contains(result, item)) continue;
+      for (var j = 1; j < argsLength; j++) {
+        if (!_.contains(arguments[j], item)) break;
+      }
+      if (j === argsLength) result.push(item);
+    }
+    return result;
+  };
+
+  // Take the difference between one array and a number of other arrays.
+  // Only the elements present in just the first array will remain.
+  _.difference = function(array) {
+    var rest = flatten(arguments, true, true, 1); // shallow, strict flatten of every argument after the first
+    return _.filter(array, function(value){
+      return !_.contains(rest, value); // keep values found in none of the other arguments
+    });
+  };
+
+  // Zip together multiple lists into a single array -- elements that share
+  // an index go together.
+  _.zip = function() {
+    return _.unzip(arguments);
+  };
+
+  // Complement of _.zip. Unzip accepts an array of arrays and groups
+  // each array's elements on shared indices
+  _.unzip = function(array) {
+    var length = array && _.max(array, getLength).length || 0;
+    var result = Array(length);
+
+    for (var index = 0; index < length; index++) {
+      result[index] = _.pluck(array, index);
+    }
+    return result;
+  };
+
+  // Converts lists into objects. Pass either a single array of `[key, value]`
+  // pairs, or two parallel arrays of the same length -- one of keys, and one of
+  // the corresponding values.
+  _.object = function(list, values) {
+    var result = {};
+    for (var i = 0, length = getLength(list); i < length; i++) {
+      if (values) {
+        result[list[i]] = values[i];
+      } else {
+        result[list[i][0]] = list[i][1];
+      }
+    }
+    return result;
+  };
+
+  // Generator function to create the findIndex and findLastIndex functions
+  function createPredicateIndexFinder(dir) {
+    return function(array, predicate, context) {
+      predicate = cb(predicate, context);
+      var length = getLength(array);
+      var index = dir > 0 ? 0 : length - 1;
+      for (; index >= 0 && index < length; index += dir) {
+        if (predicate(array[index], index, array)) return index;
+      }
+      return -1;
+    };
+  }
+
+  // Returns the first index on an array-like that passes a predicate test
+  _.findIndex = createPredicateIndexFinder(1);
+  _.findLastIndex = createPredicateIndexFinder(-1);
+
+  // Use a comparator function to figure out the smallest index at which
+  // an object should be inserted so as to maintain order. Uses binary search.
+  _.sortedIndex = function(array, obj, iteratee, context) {
+    iteratee = cb(iteratee, context, 1);
+    var value = iteratee(obj);
+    var low = 0, high = getLength(array);
+    while (low < high) {
+      var mid = Math.floor((low + high) / 2);
+      if (iteratee(array[mid]) < value) low = mid + 1; else high = mid;
+    }
+    return low;
+  };
+
+  // Generator function to create the indexOf and lastIndexOf functions
+  function createIndexFinder(dir, predicateFind, sortedIndex) {
+    return function(array, item, idx) {
+      var i = 0, length = getLength(array);
+      if (typeof idx == 'number') {
+        if (dir > 0) {
+            i = idx >= 0 ? idx : Math.max(idx + length, i); // forward: start index; negative values count from the end
+        } else {
+            length = idx >= 0 ? Math.min(idx + 1, length) : idx + length + 1; // backward: shrink the searched range instead
+        }
+      } else if (sortedIndex && idx && length) { // truthy non-number idx: look the item up via the supplied sortedIndex
+        idx = sortedIndex(array, item);
+        return array[idx] === item ? idx : -1;
+      }
+      if (item !== item) { // only NaN is unequal to itself, and === can never match it
+        idx = predicateFind(slice.call(array, i, length), _.isNaN);
+        return idx >= 0 ? idx + i : -1; // translate the slice-relative index back to the full array
+      }
+      for (idx = dir > 0 ? i : length - 1; idx >= 0 && idx < length; idx += dir) {
+        if (array[idx] === item) return idx;
+      }
+      return -1;
+    };
+  }
+
+  // Return the position of the first occurrence of an item in an array,
+  // or -1 if the item is not included in the array.
+  // If the array is large and already in sort order, pass `true`
+  // for **isSorted** to use binary search.
+  _.indexOf = createIndexFinder(1, _.findIndex, _.sortedIndex);
+  _.lastIndexOf = createIndexFinder(-1, _.findLastIndex);
+
+  // Generate an integer Array containing an arithmetic progression. A port of
+  // the native Python `range()` function. See
+  // [the Python documentation](http://docs.python.org/library/functions.html#range).
+  _.range = function(start, stop, step) {
+    if (stop == null) {
+      stop = start || 0;
+      start = 0;
+    }
+    step = step || 1;
+
+    var length = Math.max(Math.ceil((stop - start) / step), 0);
+    var range = Array(length);
+
+    for (var idx = 0; idx < length; idx++, start += step) {
+      range[idx] = start;
+    }
+
+    return range;
+  };
+
+  // Function (ahem) Functions
+  // ------------------
+
+  // Determines whether to execute a function as a constructor
+  // or a normal function with the provided arguments
+  var executeBound = function(sourceFunc, boundFunc, context, callingContext, args) {
+    if (!(callingContext instanceof boundFunc)) return sourceFunc.apply(context, args); // plain call: run with the given context
+    var self = baseCreate(sourceFunc.prototype); // constructor call: build an object inheriting from the source function's prototype
+    var result = sourceFunc.apply(self, args);
+    if (_.isObject(result)) return result; // an object returned by the source function replaces the new instance
+    return self;
+  };
+
+  // Create a function bound to a given object (assigning `this`, and arguments,
+  // optionally). Delegates to **ECMAScript 5**'s native `Function.bind` if
+  // available.
+  _.bind = function(func, context) {
+    if (nativeBind && func.bind === nativeBind) return nativeBind.apply(func, slice.call(arguments, 1)); // native path: forward context plus any preset arguments
+    if (!_.isFunction(func)) throw new TypeError('Bind must be called on a function');
+    var args = slice.call(arguments, 2); // preset arguments placed before the call-time ones
+    var bound = function() {
+      return executeBound(func, bound, context, this, args.concat(slice.call(arguments)));
+    };
+    return bound;
+  };
+
+  // Partially apply a function by creating a version that has had some of its
+  // arguments pre-filled, without changing its dynamic `this` context. _ acts
+  // as a placeholder, allowing any combination of arguments to be pre-filled.
+  _.partial = function(func) {
+    var boundArgs = slice.call(arguments, 1);
+    var bound = function() {
+      var position = 0, length = boundArgs.length;
+      var args = Array(length);
+      for (var i = 0; i < length; i++) {
+        args[i] = boundArgs[i] === _ ? arguments[position++] : boundArgs[i];
+      }
+      while (position < arguments.length) args.push(arguments[position++]);
+      return executeBound(func, bound, this, this, args);
+    };
+    return bound;
+  };
+
+  // Bind a number of an object's methods to that object. Remaining arguments
+  // are the method names to be bound. Useful for ensuring that all callbacks
+  // defined on an object belong to it.
+  _.bindAll = function(obj) {
+    var i, length = arguments.length, key;
+    if (length <= 1) throw new Error('bindAll must be passed function names');
+    for (i = 1; i < length; i++) {
+      key = arguments[i];
+      obj[key] = _.bind(obj[key], obj);
+    }
+    return obj;
+  };
+
+  // Memoize an expensive function by storing its results.
+  _.memoize = function(func, hasher) {
+    var memoize = function(key) {
+      var cache = memoize.cache;
+      var address = '' + (hasher ? hasher.apply(this, arguments) : key); // cache key: the hasher's result if given, else the first argument, as a string
+      if (!_.has(cache, address)) cache[address] = func.apply(this, arguments); // compute only on a cache miss
+      return cache[address];
+    };
+    memoize.cache = {}; // exposed as a property of the returned function
+    return memoize;
+  };
+
+  // Delays a function for the given number of milliseconds, and then calls
+  // it with the arguments supplied.
+  _.delay = function(func, wait) {
+    var args = slice.call(arguments, 2);
+    return setTimeout(function(){
+      return func.apply(null, args);
+    }, wait);
+  };
+
+  // Defers a function, scheduling it to run after the current call stack has
+  // cleared.
+  _.defer = _.partial(_.delay, _, 1);
+
+  // Returns a function, that, when invoked, will only be triggered at most once
+  // during a given window of time. Normally, the throttled function will run
+  // as much as it can, without ever going more than once per `wait` duration;
+  // but if you'd like to disable the execution on the leading edge, pass
+  // `{leading: false}`. To disable execution on the trailing edge, ditto.
+  _.throttle = function(func, wait, options) {
+    var context, args, result;
+    var timeout = null; // pending trailing-call timer, if any
+    var previous = 0; // time of the last call to func, or 0
+    if (!options) options = {};
+    var later = function() { // timer callback that performs the trailing call
+      previous = options.leading === false ? 0 : _.now();
+      timeout = null;
+      result = func.apply(context, args);
+      if (!timeout) context = args = null; // release references unless a new timer was set meanwhile
+    };
+    return function() {
+      var now = _.now();
+      if (!previous && options.leading === false) previous = now; // leading disabled: start the window now instead of calling
+      var remaining = wait - (now - previous);
+      context = this;
+      args = arguments;
+      if (remaining <= 0 || remaining > wait) { // window elapsed, or `now` is earlier than `previous`: call immediately
+        if (timeout) {
+          clearTimeout(timeout);
+          timeout = null;
+        }
+        previous = now;
+        result = func.apply(context, args);
+        if (!timeout) context = args = null;
+      } else if (!timeout && options.trailing !== false) { // inside the window: schedule a single trailing call
+        timeout = setTimeout(later, remaining);
+      }
+      return result; // most recent return value of func
+    };
+  };
+
+  // Returns a function, that, as long as it continues to be invoked, will not
+  // be triggered. The function will be called after it stops being called for
+  // N milliseconds. If `immediate` is passed, trigger the function on the
+  // leading edge, instead of the trailing.
+  _.debounce = function(func, wait, immediate) {
+    var timeout, args, context, timestamp, result;
+
+    var later = function() {
+      var last = _.now() - timestamp; // time since the most recent call to the debounced function
+
+      if (last < wait && last >= 0) {
+        timeout = setTimeout(later, wait - last); // called again meanwhile: wait out the remainder
+      } else {
+        timeout = null;
+        if (!immediate) { // trailing-edge mode: run func now
+          result = func.apply(context, args);
+          if (!timeout) context = args = null;
+        }
+      }
+    };
+
+    return function() {
+      context = this;
+      args = arguments;
+      timestamp = _.now();
+      var callNow = immediate && !timeout; // leading-edge call only when no timer is pending
+      if (!timeout) timeout = setTimeout(later, wait);
+      if (callNow) {
+        result = func.apply(context, args);
+        context = args = null;
+      }
+
+      return result; // most recent return value of func
+    };
+  };
+
+  // Returns the first function passed as an argument to the second,
+  // allowing you to adjust arguments, run code before and after, and
+  // conditionally execute the original function.
+  _.wrap = function(func, wrapper) {
+    return _.partial(wrapper, func);
+  };
+
+  // Returns a negated version of the passed-in predicate.
+  _.negate = function(predicate) {
+    return function() {
+      return !predicate.apply(this, arguments);
+    };
+  };
+
+  // Returns a function that is the composition of a list of functions, each
+  // consuming the return value of the function that follows.
+  _.compose = function() {
+    var args = arguments;
+    var start = args.length - 1;
+    return function() {
+      var i = start;
+      var result = args[start].apply(this, arguments);
+      while (i--) result = args[i].call(this, result);
+      return result;
+    };
+  };
+
+  // Returns a function that will only be executed on and after the Nth call.
+  _.after = function(times, func) {
+    return function() {
+      if (--times < 1) { // counts down on every call; func runs once the counter drops below 1
+        return func.apply(this, arguments);
+      }
+    };
+  };
+
+  // Returns a function that will only be executed up to (but not including) the Nth call.
+  _.before = function(times, func) {
+    var memo; // result of the most recent real call to func
+    return function() {
+      if (--times > 0) {
+        memo = func.apply(this, arguments);
+      }
+      if (times <= 1) func = null; // no further calls will happen: drop the reference to func
+      return memo;
+    };
+  };
+
+  // Returns a function that will be executed at most one time, no matter how
+  // often you call it. Useful for lazy initialization.
+  _.once = _.partial(_.before, 2);
+
+  // Object Functions
+  // ----------------
+
+  // Keys in IE < 9 that won't be iterated by `for key in ...` and thus missed.
+  var hasEnumBug = !{toString: null}.propertyIsEnumerable('toString');
+  var nonEnumerableProps = ['valueOf', 'isPrototypeOf', 'toString',
+                      'propertyIsEnumerable', 'hasOwnProperty', 'toLocaleString'];
+
+  function collectNonEnumProps(obj, keys) {
+    var nonEnumIdx = nonEnumerableProps.length;
+    var constructor = obj.constructor;
+    var proto = (_.isFunction(constructor) && constructor.prototype) || ObjProto; // prototype to compare against, defaulting to Object.prototype
+
+    // `constructor` is a special case: it is added when _.has reports it on obj.
+    var prop = 'constructor';
+    if (_.has(obj, prop) && !_.contains(keys, prop)) keys.push(prop);
+
+    while (nonEnumIdx--) {
+      prop = nonEnumerableProps[nonEnumIdx];
+      if (prop in obj && obj[prop] !== proto[prop] && !_.contains(keys, prop)) { // present, differs from the prototype's value, and not yet listed
+        keys.push(prop);
+      }
+    }
+  }
+
+  // Retrieve the names of an object's own properties.
+  // Delegates to **ECMAScript 5**'s native `Object.keys`
+  _.keys = function(obj) {
+    if (!_.isObject(obj)) return [];
+    if (nativeKeys) return nativeKeys(obj);
+    var keys = [];
+    for (var key in obj) if (_.has(obj, key)) keys.push(key);
+    // Ahem, IE < 9.
+    if (hasEnumBug) collectNonEnumProps(obj, keys);
+    return keys;
+  };
+
+  // Retrieve all the property names of an object.
+  _.allKeys = function(obj) {
+    if (!_.isObject(obj)) return [];
+    var keys = [];
+    for (var key in obj) keys.push(key);
+    // Ahem, IE < 9.
+    if (hasEnumBug) collectNonEnumProps(obj, keys);
+    return keys;
+  };
+
+  // Retrieve the values of an object's properties.
+  _.values = function(obj) {
+    var keys = _.keys(obj);
+    var length = keys.length;
+    var values = Array(length);
+    for (var i = 0; i < length; i++) {
+      values[i] = obj[keys[i]];
+    }
+    return values;
+  };
+
+  // Returns the results of applying the iteratee to each element of the object
+  // In contrast to _.map it returns an object
+  _.mapObject = function(obj, iteratee, context) {
+    iteratee = cb(iteratee, context);
+    var keys =  _.keys(obj),
+          length = keys.length,
+          results = {},
+          currentKey;
+      for (var index = 0; index < length; index++) {
+        currentKey = keys[index];
+        results[currentKey] = iteratee(obj[currentKey], currentKey, obj);
+      }
+      return results;
+  };
+
+  // Convert an object into a list of `[key, value]` pairs.
+  _.pairs = function(obj) {
+    var keys = _.keys(obj);
+    var length = keys.length;
+    var pairs = Array(length);
+    for (var i = 0; i < length; i++) {
+      pairs[i] = [keys[i], obj[keys[i]]];
+    }
+    return pairs;
+  };
+
+  // Invert the keys and values of an object. The values must be serializable.
+  _.invert = function(obj) {
+    var result = {};
+    var keys = _.keys(obj);
+    for (var i = 0, length = keys.length; i < length; i++) {
+      result[obj[keys[i]]] = keys[i];
+    }
+    return result;
+  };
+
+  // Return a sorted list of the function names available on the object.
+  // Aliased as `methods`
+  _.functions = _.methods = function(obj) {
+    var names = [];
+    for (var key in obj) {
+      if (_.isFunction(obj[key])) names.push(key);
+    }
+    return names.sort();
+  };
+
+  // Extend a given object with all the properties in passed-in object(s).
+  // Built by `createAssigner` (defined outside this section) with `_.allKeys`,
+  // presumably the key lister applied to each source object — confirm there.
+  _.extend = createAssigner(_.allKeys);
+
+  // Assigns a given object with all the own properties in the passed-in object(s)
+  // (https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object/assign)
+  // Same factory as `_.extend`, but given `_.keys`, which lists own properties
+  // only. Also exposed under the alias `_.assign`.
+  _.extendOwn = _.assign = createAssigner(_.keys);
+
+  // Returns the first key on an object that passes a predicate test
+  _.findKey = function(obj, predicate, context) {
+    predicate = cb(predicate, context);
+    var keys = _.keys(obj), key;
+    for (var i = 0, length = keys.length; i < length; i++) {
+      key = keys[i];
+      if (predicate(obj[key], key, obj)) return key;
+    }
+  };
+
+  // Return a copy of the object only containing the whitelisted properties.
+  _.pick = function(object, oiteratee, context) {
+    var result = {}, obj = object, iteratee, keys;
+    if (obj == null) return result;
+    if (_.isFunction(oiteratee)) {
+      keys = _.allKeys(obj);
+      iteratee = optimizeCb(oiteratee, context);
+    } else {
+      keys = flatten(arguments, false, false, 1);
+      iteratee = function(value, key, obj) { return key in obj; };
+      obj = Object(obj);
+    }
+    for (var i = 0, length = keys.length; i < length; i++) {
+      var key = keys[i];
+      var value = obj[key];
+      if (iteratee(value, key, obj)) result[key] = value;
+    }
+    return result;
+  };
+
+   // Return a copy of the object without the blacklisted properties.
+  _.omit = function(obj, iteratee, context) {
+    if (_.isFunction(iteratee)) {
+      iteratee = _.negate(iteratee);
+    } else {
+      var keys = _.map(flatten(arguments, false, false, 1), String);
+      iteratee = function(value, key) {
+        return !_.contains(keys, key);
+      };
+    }
+    return _.pick(obj, iteratee, context);
+  };
+
+  // Fill in a given object with default properties.
+  // Same factory and key lister as `_.extend`, with an extra `true` flag.
+  // NOTE(review): presumably the flag restricts assignment to properties that
+  // are still undefined on the target — confirm in `createAssigner`.
+  _.defaults = createAssigner(_.allKeys, true);
+
+  // Creates an object that inherits from the given prototype object.
+  // If additional properties are provided then they will be added to the
+  // created object.
+  _.create = function(prototype, props) {
+    var result = baseCreate(prototype);
+    if (props) _.extendOwn(result, props);
+    return result;
+  };
+
+  // Create a (shallow-cloned) duplicate of an object.
+  _.clone = function(obj) {
+    if (!_.isObject(obj)) return obj;
+    return _.isArray(obj) ? obj.slice() : _.extend({}, obj);
+  };
+
+  // Invokes interceptor with the obj, and then returns obj.
+  // The primary purpose of this method is to "tap into" a method chain, in
+  // order to perform operations on intermediate results within the chain.
+  _.tap = function(obj, interceptor) {
+    // The interceptor's own return value is discarded.
+    interceptor(obj);
+    return obj;
+  };
+
+  // Returns whether an object has a given set of `key:value` pairs.
+  _.isMatch = function(object, attrs) {
+    var keys = _.keys(attrs), length = keys.length;
+    if (object == null) return !length;
+    var obj = Object(object);
+    for (var i = 0; i < length; i++) {
+      var key = keys[i];
+      if (attrs[key] !== obj[key] || !(key in obj)) return false;
+    }
+    return true;
+  };
+
+
+  // Internal recursive comparison function for `isEqual`.
+  var eq = function(a, b, aStack, bStack) {
+    // `a` and `b` are the two values under comparison. `aStack` and `bStack`
+    // are parallel lists of the containers currently being traversed on each
+    // side; the top-level caller omits them and they are created on demand
+    // below. Returns `true` when the values are considered deeply equal.
+    //
+    // Identical objects are equal. `0 === -0`, but they aren't identical.
+    // See the [Harmony `egal` proposal](http://wiki.ecmascript.org/doku.php?id=harmony:egal).
+    if (a === b) return a !== 0 || 1 / a === 1 / b;
+    // A strict comparison is necessary because `null == undefined`.
+    if (a == null || b == null) return a === b;
+    // Unwrap any wrapped objects.
+    if (a instanceof _) a = a._wrapped;
+    if (b instanceof _) b = b._wrapped;
+    // Compare `[[Class]]` names.
+    var className = toString.call(a);
+    if (className !== toString.call(b)) return false;
+    switch (className) {
+      // Strings, numbers, regular expressions, dates, and booleans are compared by value.
+      case '[object RegExp]':
+      // RegExps are coerced to strings for comparison (Note: '' + /a/i === '/a/i')
+      case '[object String]':
+        // Primitives and their corresponding object wrappers are equivalent; thus, `"5"` is
+        // equivalent to `new String("5")`.
+        return '' + a === '' + b;
+      case '[object Number]':
+        // `NaN`s are equivalent, but non-reflexive.
+        // Object(NaN) is equivalent to NaN
+        if (+a !== +a) return +b !== +b;
+        // An `egal` comparison is performed for other numeric values.
+        return +a === 0 ? 1 / +a === 1 / b : +a === +b;
+      case '[object Date]':
+      case '[object Boolean]':
+        // Coerce dates and booleans to numeric primitive values. Dates are compared by their
+        // millisecond representations. Note that invalid dates with millisecond representations
+        // of `NaN` are not equivalent.
+        return +a === +b;
+    }
+
+    // Everything that reaches this point is compared structurally.
+    var areArrays = className === '[object Array]';
+    if (!areArrays) {
+      // Two functions that are not identical (checked at the top) are unequal.
+      if (typeof a != 'object' || typeof b != 'object') return false;
+
+      // Objects with different constructors are not equivalent, but `Object`s or `Array`s
+      // from different frames are.
+      var aCtor = a.constructor, bCtor = b.constructor;
+      if (aCtor !== bCtor && !(_.isFunction(aCtor) && aCtor instanceof aCtor &&
+                               _.isFunction(bCtor) && bCtor instanceof bCtor)
+                          && ('constructor' in a && 'constructor' in b)) {
+        return false;
+      }
+    }
+    // Assume equality for cyclic structures. The algorithm for detecting cyclic
+    // structures is adapted from ES 5.1 section 15.12.3, abstract operation `JO`.
+
+    // Initializing stack of traversed objects.
+    // It's done here since we only need them for objects and arrays comparison.
+    aStack = aStack || [];
+    bStack = bStack || [];
+    var length = aStack.length;
+    while (length--) {
+      // Linear search. Performance is inversely proportional to the number of
+      // unique nested structures.
+      // When `a` is already being traversed, the pair is equal only if `b`
+      // sits at the same position of the other stack.
+      if (aStack[length] === a) return bStack[length] === b;
+    }
+
+    // Add the first object to the stack of traversed objects.
+    aStack.push(a);
+    bStack.push(b);
+
+    // Recursively compare objects and arrays.
+    if (areArrays) {
+      // Compare array lengths to determine if a deep comparison is necessary.
+      length = a.length;
+      if (length !== b.length) return false;
+      // Deep compare the contents, ignoring non-numeric properties.
+      while (length--) {
+        if (!eq(a[length], b[length], aStack, bStack)) return false;
+      }
+    } else {
+      // Deep compare objects.
+      var keys = _.keys(a), key;
+      length = keys.length;
+      // Ensure that both objects contain the same number of properties before comparing deep equality.
+      if (_.keys(b).length !== length) return false;
+      while (length--) {
+        // Deep compare each member
+        key = keys[length];
+        if (!(_.has(b, key) && eq(a[key], b[key], aStack, bStack))) return false;
+      }
+    }
+    // Remove the first object from the stack of traversed objects.
+    aStack.pop();
+    bStack.pop();
+    return true;
+  };
+
+  // Perform a deep comparison to check if two objects are equal.
+  _.isEqual = function(a, b) {
+    // The traversal stacks are left out so `eq` starts with fresh ones.
+    return eq(a, b);
+  };
+
+  // Is a given array, string, or object empty?
+  // An "empty" object has no enumerable own-properties.
+  _.isEmpty = function(obj) {
+    if (obj == null) return true;
+    if (isArrayLike(obj) && (_.isArray(obj) || _.isString(obj) || _.isArguments(obj))) return obj.length === 0;
+    return _.keys(obj).length === 0;
+  };
+
+  // Is a given value a DOM element?
+  _.isElement = function(obj) {
+    // Any truthy value whose `nodeType` is exactly 1 qualifies; the double
+    // negation forces a boolean result.
+    return !!(obj && obj.nodeType === 1);
+  };
+
+  // Is a given value an array?
+  // Delegates to ECMA5's native Array.isArray
+  // Falls back to a `[[Class]]` tag check when `nativeIsArray` is not set.
+  _.isArray = nativeIsArray || function(obj) {
+    return toString.call(obj) === '[object Array]';
+  };
+
+  // Is a given variable an object?
+  _.isObject = function(obj) {
+    var type = typeof obj;
+    return type === 'function' || type === 'object' && !!obj;
+  };
+
+  // Add some isType methods: isArguments, isFunction, isString, isNumber, isDate, isRegExp, isError.
+  _.each(['Arguments', 'Function', 'String', 'Number', 'Date', 'RegExp', 'Error'], function(name) {
+    _['is' + name] = function(obj) {
+      return toString.call(obj) === '[object ' + name + ']';
+    };
+  });
+
+  // Define a fallback version of the method in browsers (ahem, IE < 9), where
+  // there isn't any inspectable "Arguments" type.
+  // Probe with the `arguments` object of the enclosing function: if the tag
+  // check above does not recognise it, detect `arguments` objects by their
+  // own `callee` property instead.
+  if (!_.isArguments(arguments)) {
+    _.isArguments = function(obj) {
+      return _.has(obj, 'callee');
+    };
+  }
+
+  // Optimize `isFunction` if appropriate. Work around some typeof bugs in old v8,
+  // IE 11 (#1621), and in Safari 8 (#1929).
+  // Only replace the tag-based `isFunction` when `typeof` behaves sanely for
+  // a regex literal and for `Int8Array`.
+  if (typeof /./ != 'function' && typeof Int8Array != 'object') {
+    _.isFunction = function(obj) {
+      // NOTE(review): `|| false` looks redundant for a boolean expression;
+      // presumably kept for the engine bugs referenced above — confirm.
+      return typeof obj == 'function' || false;
+    };
+  }
+
+  // Is a given object a finite number?
+  _.isFinite = function(obj) {
+    return isFinite(obj) && !isNaN(parseFloat(obj));
+  };
+
+  // Is the given value `NaN`? (NaN is the only number which does not equal itself).
+  _.isNaN = function(obj) {
+    return _.isNumber(obj) && obj !== +obj;
+  };
+
+  // Is a given value a boolean?
+  _.isBoolean = function(obj) {
+    return obj === true || obj === false || toString.call(obj) === '[object Boolean]';
+  };
+
+  // Is a given value equal to null?
+  _.isNull = function(obj) {
+    // Strict comparison: `undefined` does not count as null here.
+    return obj === null;
+  };
+
+  // Is a given variable undefined?
+  _.isUndefined = function(obj) {
+    // `void 0` always evaluates to the real `undefined` value.
+    return obj === void 0;
+  };
+
+  // Shortcut function for checking if an object has a given property directly
+  // on itself (in other words, not on a prototype).
+  _.has = function(obj, key) {
+    // The null guard makes null/undefined yield `false` instead of throwing.
+    return obj != null && hasOwnProperty.call(obj, key);
+  };
+
+  // Utility Functions
+  // -----------------
+
+  // Run Underscore.js in *noConflict* mode, returning the `_` variable to its
+  // previous owner. Returns a reference to the Underscore object.
+  _.noConflict = function() {
+    // `root` and `previousUnderscore` are defined outside this section.
+    root._ = previousUnderscore;
+    return this;
+  };
+
+  // Keep the identity function around for default iteratees.
+  _.identity = function(value) {
+    // Returns its first argument unchanged; extra arguments are ignored.
+    return value;
+  };
+
+  // Predicate-generating functions. Often useful outside of Underscore.
+  _.constant = function(value) {
+    // The returned function ignores its arguments and always yields `value`.
+    return function() {
+      return value;
+    };
+  };
+
+  // Does nothing and returns undefined.
+  _.noop = function(){};
+
+  // Public alias for the internal `property` helper (defined outside this section).
+  _.property = property;
+
+  // Generates a function for a given object that returns a given property.
+  _.propertyOf = function(obj) {
+    return obj == null ? function(){} : function(key) {
+      return obj[key];
+    };
+  };
+
+  // Returns a predicate for checking whether an object has a given set of
+  // `key:value` pairs.
+  _.matcher = _.matches = function(attrs) {
+    attrs = _.extendOwn({}, attrs);
+    return function(obj) {
+      return _.isMatch(obj, attrs);
+    };
+  };
+
+  // Run a function **n** times.
+  _.times = function(n, iteratee, context) {
+    var accum = Array(Math.max(0, n));
+    iteratee = optimizeCb(iteratee, context, 1);
+    for (var i = 0; i < n; i++) accum[i] = iteratee(i);
+    return accum;
+  };
+
+  // Return a random integer between min and max (inclusive).
+  _.random = function(min, max) {
+    if (max == null) {
+      max = min;
+      min = 0;
+    }
+    return min + Math.floor(Math.random() * (max - min + 1));
+  };
+
+  // A (possibly faster) way to get the current timestamp as an integer.
+  // Uses the native `Date.now` when present, else builds a Date and reads it.
+  _.now = Date.now || function() {
+    return new Date().getTime();
+  };
+
+   // List of HTML entities for escaping.
+  // Keys are the raw characters, values their HTML entity replacements.
+  var escapeMap = {
+    '&': '&amp;',
+    '<': '&lt;',
+    '>': '&gt;',
+    '"': '&quot;',
+    "'": '&#x27;',
+    '`': '&#x60;'
+  };
+  // Reverse lookup (entity -> character), derived with `_.invert`.
+  var unescapeMap = _.invert(escapeMap);
+
+  // Functions for escaping and unescaping strings to/from HTML interpolation.
+  var createEscaper = function(map) {
+    var escaper = function(match) {
+      return map[match];
+    };
+    // Regexes for identifying a key that needs to be escaped
+    var source = '(?:' + _.keys(map).join('|') + ')';
+    var testRegexp = RegExp(source);
+    var replaceRegexp = RegExp(source, 'g');
+    return function(string) {
+      string = string == null ? '' : '' + string;
+      return testRegexp.test(string) ? string.replace(replaceRegexp, escaper) : string;
+    };
+  };
+  // `_.escape` replaces the characters listed in `escapeMap` with entities;
+  // `_.unescape` applies the inverted map to turn those entities back.
+  _.escape = createEscaper(escapeMap);
+  _.unescape = createEscaper(unescapeMap);
+
+  // If the value of the named `property` is a function then invoke it with the
+  // `object` as context; otherwise, return it.
+  _.result = function(object, property, fallback) {
+    var value = object == null ? void 0 : object[property];
+    if (value === void 0) {
+      value = fallback;
+    }
+    return _.isFunction(value) ? value.call(object) : value;
+  };
+
+  // Generate a unique integer id (unique within the entire client session).
+  // Useful for temporary DOM ids.
+  var idCounter = 0;
+  _.uniqueId = function(prefix) {
+    var id = ++idCounter + '';
+    return prefix ? prefix + id : id;
+  };
+
+  // By default, Underscore uses ERB-style template delimiters, change the
+  // following template settings to use alternative delimiters.
+  _.templateSettings = {
+    // <% ... %>  : captured code is emitted by `_.template` as statements.
+    evaluate    : /<%([\s\S]+?)%>/g,
+    // <%= ... %> : captured expression is inserted as-is (null becomes '').
+    interpolate : /<%=([\s\S]+?)%>/g,
+    // <%- ... %> : captured expression is passed through `_.escape`.
+    escape      : /<%-([\s\S]+?)%>/g
+  };
+
+  // When customizing `templateSettings`, if you don't want to define an
+  // interpolation, evaluation or escaping regex, we need one that is
+  // guaranteed not to match.
+  // It demands a start-of-input anchor after already consuming one character.
+  var noMatch = /(.)^/;
+
+  // Certain characters need to be escaped so that they can be put into a
+  // string literal.
+  // Maps each character to the text that follows the backslash in its escape.
+  var escapes = {
+    "'":      "'",
+    '\\':     '\\',
+    '\r':     'r',
+    '\n':     'n',
+    '\u2028': 'u2028',
+    '\u2029': 'u2029'
+  };
+
+  // Matches every character that has an entry in `escapes`.
+  var escaper = /\\|'|\r|\n|\u2028|\u2029/g;
+
+  // Replacement callback: prefixes the mapped text with a backslash.
+  var escapeChar = function(match) {
+    return '\\' + escapes[match];
+  };
+
+  // JavaScript micro-templating, similar to John Resig's implementation.
+  // Underscore templating handles arbitrary delimiters, preserves whitespace,
+  // and correctly escapes quotes within interpolated code.
+  // NB: `oldSettings` only exists for backwards compatibility.
+  _.template = function(text, settings, oldSettings) {
+    if (!settings && oldSettings) settings = oldSettings;
+    settings = _.defaults({}, settings, _.templateSettings);
+
+    // Combine delimiters into one regular expression via alternation.
+    var matcher = RegExp([
+      (settings.escape || noMatch).source,
+      (settings.interpolate || noMatch).source,
+      (settings.evaluate || noMatch).source
+    ].join('|') + '|$', 'g');
+
+    // Compile the template source, escaping string literals appropriately.
+    var index = 0;
+    var source = "__p+='";
+    text.replace(matcher, function(match, escape, interpolate, evaluate, offset) {
+      source += text.slice(index, offset).replace(escaper, escapeChar);
+      index = offset + match.length;
+
+      if (escape) {
+        source += "'+\n((__t=(" + escape + "))==null?'':_.escape(__t))+\n'";
+      } else if (interpolate) {
+        source += "'+\n((__t=(" + interpolate + "))==null?'':__t)+\n'";
+      } else if (evaluate) {
+        source += "';\n" + evaluate + "\n__p+='";
+      }
+
+      // Adobe VMs need the match returned to produce the correct offest.
+      return match;
+    });
+    source += "';\n";
+
+    // If a variable is not specified, place data values in local scope.
+    if (!settings.variable) source = 'with(obj||{}){\n' + source + '}\n';
+
+    source = "var __t,__p='',__j=Array.prototype.join," +
+      "print=function(){__p+=__j.call(arguments,'');};\n" +
+      source + 'return __p;\n';
+
+    try {
+      var render = new Function(settings.variable || 'obj', '_', source);
+    } catch (e) {
+      e.source = source;
+      throw e;
+    }
+
+    var template = function(data) {
+      return render.call(this, data, _);
+    };
+
+    // Provide the compiled source as a convenience for precompilation.
+    var argument = settings.variable || 'obj';
+    template.source = 'function(' + argument + '){\n' + source + '}';
+
+    return template;
+  };
+
+  // Add a "chain" function. Start chaining a wrapped Underscore object.
+  _.chain = function(obj) {
+    var instance = _(obj);
+    instance._chain = true;
+    return instance;
+  };
+
+  // OOP
+  // ---------------
+  // If Underscore is called as a function, it returns a wrapped object that
+  // can be used OO-style. This wrapper holds altered versions of all the
+  // underscore functions. Wrapped objects may be chained.
+
+  // Helper function to continue chaining intermediate results.
+  var result = function(instance, obj) {
+    return instance._chain ? _(obj).chain() : obj;
+  };
+
+  // Add your own custom functions to the Underscore object.
+  _.mixin = function(obj) {
+    _.each(_.functions(obj), function(name) {
+      var func = _[name] = obj[name];
+      _.prototype[name] = function() {
+        var args = [this._wrapped];
+        push.apply(args, arguments);
+        return result(this, func.apply(_, args));
+      };
+    });
+  };
+
+  // Add all of the Underscore functions to the wrapper object.
+  _.mixin(_);
+
+  // Add all mutator Array functions to the wrapper.
+  _.each(['pop', 'push', 'reverse', 'shift', 'sort', 'splice', 'unshift'], function(name) {
+    var method = ArrayProto[name];
+    _.prototype[name] = function() {
+      var obj = this._wrapped;
+      method.apply(obj, arguments);
+      if ((name === 'shift' || name === 'splice') && obj.length === 0) delete obj[0];
+      return result(this, obj);
+    };
+  });
+
+  // Add all accessor Array functions to the wrapper.
+  _.each(['concat', 'join', 'slice'], function(name) {
+    var method = ArrayProto[name];
+    _.prototype[name] = function() {
+      return result(this, method.apply(this._wrapped, arguments));
+    };
+  });
+
+  // Extracts the result from a wrapped and chained object.
+  _.prototype.value = function() {
+    // Hands back the raw value stored on the wrapper.
+    return this._wrapped;
+  };
+
+  // Provide unwrapping proxy for some methods used in engine operations
+  // such as arithmetic and JSON stringification.
+  // Both names are aliases of `value`, so they yield the wrapped value.
+  _.prototype.valueOf = _.prototype.toJSON = _.prototype.value;
+
+  _.prototype.toString = function() {
+    // String-coerces the wrapped value via concatenation.
+    return '' + this._wrapped;
+  };
+
+  // AMD registration happens at the end for compatibility with AMD loaders
+  // that may not enforce next-turn semantics on modules. Even though general
+  // practice for AMD registration is to be anonymous, underscore registers
+  // as a named module because, like jQuery, it is a base library that is
+  // popular enough to be bundled in a third party lib, but not be part of
+  // an AMD load request. Those cases could generate an error when an
+  // anonymous define() is called outside of a loader request.
+  // Registers under the fixed module name 'underscore' with no dependencies;
+  // skipped entirely when no AMD-style `define` is present.
+  if (typeof define === 'function' && define.amd) {
+    define('underscore', [], function() {
+      return _;
+    });
+  }
+}.call(this));
+
+},{}],26:[function(require,module,exports){
+arguments[4][19][0].apply(exports,arguments)
+},{"dup":19}],27:[function(require,module,exports){
+module.exports = function isBuffer(arg) {
+  return arg && typeof arg === 'object'
+    && typeof arg.copy === 'function'
+    && typeof arg.fill === 'function'
+    && typeof arg.readUInt8 === 'function';
+}
+},{}],28:[function(require,module,exports){
+(function (process,global){
+// Copyright Joyent, Inc. and other Node contributors.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to permit
+// persons to whom the Software is furnished to do so, subject to the
+// following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
+// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+// USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+var formatRegExp = /%[sdj%]/g;
+exports.format = function(f) {
+  if (!isString(f)) {
+    var objects = [];
+    for (var i = 0; i < arguments.length; i++) {
+      objects.push(inspect(arguments[i]));
+    }
+    return objects.join(' ');
+  }
+
+  var i = 1;
+  var args = arguments;
+  var len = args.length;
+  var str = String(f).replace(formatRegExp, function(x) {
+    if (x === '%%') return '%';
+    if (i >= len) return x;
+    switch (x) {
+      case '%s': return String(args[i++]);
+      case '%d': return Number(args[i++]);
+      case '%j':
+        try {
+          return JSON.stringify(args[i++]);
+        } catch (_) {
+          return '[Circular]';
+        }
+      default:
+        return x;
+    }
+  });
+  for (var x = args[i]; i < len; x = args[++i]) {
+    if (isNull(x) || !isObject(x)) {
+      str += ' ' + x;
+    } else {
+      str += ' ' + inspect(x);
+    }
+  }
+  return str;
+};
+
+
+// Mark that a method should not be used.
+// Returns a modified function which warns once by default.
+// If --no-deprecation is set, then it is a no-op.
+exports.deprecate = function(fn, msg) {
+  // Allow for deprecating things in the process of starting up.
+  if (isUndefined(global.process)) {
+    return function() {
+      return exports.deprecate(fn, msg).apply(this, arguments);
+    };
+  }
+
+  if (process.noDeprecation === true) {
+    return fn;
+  }
+
+  var warned = false;
+  function deprecated() {
+    if (!warned) {
+      if (process.throwDeprecation) {
+        throw new Error(msg);
+      } else if (process.traceDeprecation) {
+        console.trace(msg);
+      } else {
+        console.error(msg);
+      }
+      warned = true;
+    }
+    return fn.apply(this, arguments);
+  }
+
+  return deprecated;
+};
+
+
+var debugs = {};
+var debugEnviron;
+exports.debuglog = function(set) {
+  if (isUndefined(debugEnviron))
+    debugEnviron = process.env.NODE_DEBUG || '';
+  set = set.toUpperCase();
+  if (!debugs[set]) {
+    if (new RegExp('\\b' + set + '\\b', 'i').test(debugEnviron)) {
+      var pid = process.pid;
+      debugs[set] = function() {
+        var msg = exports.format.apply(exports, arguments);
+        console.error('%s %d: %s', set, pid, msg);
+      };
+    } else {
+      debugs[set] = function() {};
+    }
+  }
+  return debugs[set];
+};
+
+
+/**
+ * Echoes the given value. Tries to print the value out
+ * in the best way possible given the different types.
+ *
+ * @param {Object} obj The object to print out.
+ * @param {Object} opts Optional options object that alters the output.
+ */
+/* legacy: obj, showHidden, depth, colors*/
+function inspect(obj, opts) {
+  // default options
+  var ctx = {
+    seen: [],
+    stylize: stylizeNoColor
+  };
+  // legacy...
+  if (arguments.length >= 3) ctx.depth = arguments[2];
+  if (arguments.length >= 4) ctx.colors = arguments[3];
+  if (isBoolean(opts)) {
+    // legacy...
+    ctx.showHidden = opts;
+  } else if (opts) {
+    // got an "options" object
+    exports._extend(ctx, opts);
+  }
+  // set default options
+  if (isUndefined(ctx.showHidden)) ctx.showHidden = false;
+  if (isUndefined(ctx.depth)) ctx.depth = 2;
+  if (isUndefined(ctx.colors)) ctx.colors = false;
+  if (isUndefined(ctx.customInspect)) ctx.customInspect = true;
+  if (ctx.colors) ctx.stylize = stylizeWithColor;
+  return formatValue(ctx, obj, ctx.depth);
+}
+exports.inspect = inspect;
+
+
+// http://en.wikipedia.org/wiki/ANSI_escape_code#graphics
+// Each entry is a pair of codes: `stylizeWithColor` emits the first before
+// the text and the second after it.
+inspect.colors = {
+  'bold' : [1, 22],
+  'italic' : [3, 23],
+  'underline' : [4, 24],
+  'inverse' : [7, 27],
+  'white' : [37, 39],
+  'grey' : [90, 39],
+  'black' : [30, 39],
+  'blue' : [34, 39],
+  'cyan' : [36, 39],
+  'green' : [32, 39],
+  'magenta' : [35, 39],
+  'red' : [31, 39],
+  'yellow' : [33, 39]
+};
+
+// Don't use 'blue': it is not visible on cmd.exe
+// Maps a style type (the second argument of `ctx.stylize`) to a key of
+// `inspect.colors`.
+inspect.styles = {
+  'special': 'cyan',
+  'number': 'yellow',
+  'boolean': 'yellow',
+  'undefined': 'grey',
+  'null': 'bold',
+  'string': 'green',
+  'date': 'magenta',
+  // "name": intentionally not styling
+  'regexp': 'red'
+};
+
+
+function stylizeWithColor(str, styleType) {
+  var style = inspect.styles[styleType];
+
+  if (style) {
+    return '\u001b[' + inspect.colors[style][0] + 'm' + str +
+           '\u001b[' + inspect.colors[style][1] + 'm';
+  } else {
+    return str;
+  }
+}
+
+
+function stylizeNoColor(str, styleType) {
+  // `styleType` is accepted to match `stylizeWithColor` but is ignored.
+  return str;
+}
+
+
+function arrayToHash(array) {
+  var hash = {};
+
+  array.forEach(function(val, idx) {
+    hash[val] = true;
+  });
+
+  return hash;
+}
+
+
+function formatValue(ctx, value, recurseTimes) {
+  // Formats `value` as a string. `ctx` is the context built by `inspect`
+  // (this function reads `customInspect`, `showHidden`, `stylize` and `seen`);
+  // `recurseTimes` is the remaining nesting budget.
+  //
+  // Provide a hook for user-specified inspect functions.
+  // Check that value is an object with an inspect function on it
+  if (ctx.customInspect &&
+      value &&
+      isFunction(value.inspect) &&
+      // Filter out the util module, its inspect function is special
+      value.inspect !== exports.inspect &&
+      // Also filter out any prototype objects using the circular check.
+      !(value.constructor && value.constructor.prototype === value)) {
+    var ret = value.inspect(recurseTimes, ctx);
+    // A custom inspect may return a non-string; format that result instead.
+    if (!isString(ret)) {
+      ret = formatValue(ctx, ret, recurseTimes);
+    }
+    return ret;
+  }
+
+  // Primitive types cannot have properties
+  var primitive = formatPrimitive(ctx, value);
+  if (primitive) {
+    return primitive;
+  }
+
+  // Look up the keys of the object.
+  var keys = Object.keys(value);
+  // Remember which keys are enumerable before `showHidden` may widen the list.
+  var visibleKeys = arrayToHash(keys);
+
+  if (ctx.showHidden) {
+    keys = Object.getOwnPropertyNames(value);
+  }
+
+  // IE doesn't make error fields non-enumerable
+  // http://msdn.microsoft.com/en-us/library/ie/dww52sbt(v=vs.94).aspx
+  if (isError(value)
+      && (keys.indexOf('message') >= 0 || keys.indexOf('description') >= 0)) {
+    return formatError(value);
+  }
+
+  // Some type of object without properties can be shortcutted.
+  if (keys.length === 0) {
+    if (isFunction(value)) {
+      var name = value.name ? ': ' + value.name : '';
+      return ctx.stylize('[Function' + name + ']', 'special');
+    }
+    if (isRegExp(value)) {
+      return ctx.stylize(RegExp.prototype.toString.call(value), 'regexp');
+    }
+    if (isDate(value)) {
+      return ctx.stylize(Date.prototype.toString.call(value), 'date');
+    }
+    if (isError(value)) {
+      return formatError(value);
+    }
+  }
+
+  // `base` is a type-specific prefix printed just inside the opening brace.
+  var base = '', array = false, braces = ['{', '}'];
+
+  // Make Array say that they are Array
+  if (isArray(value)) {
+    array = true;
+    braces = ['[', ']'];
+  }
+
+  // Make functions say that they are functions
+  if (isFunction(value)) {
+    var n = value.name ? ': ' + value.name : '';
+    base = ' [Function' + n + ']';
+  }
+
+  // Make RegExps say that they are RegExps
+  if (isRegExp(value)) {
+    base = ' ' + RegExp.prototype.toString.call(value);
+  }
+
+  // Make dates with properties first say the date
+  if (isDate(value)) {
+    base = ' ' + Date.prototype.toUTCString.call(value);
+  }
+
+  // Make error with message first say the error
+  if (isError(value)) {
+    base = ' ' + formatError(value);
+  }
+
+  // Nothing to list: print just the braces around the prefix.
+  if (keys.length === 0 && (!array || value.length == 0)) {
+    return braces[0] + base + braces[1];
+  }
+
+  // Nesting budget exhausted: print a placeholder instead of descending.
+  if (recurseTimes < 0) {
+    if (isRegExp(value)) {
+      return ctx.stylize(RegExp.prototype.toString.call(value), 'regexp');
+    } else {
+      return ctx.stylize('[Object]', 'special');
+    }
+  }
+
+  // Track the value while its members are formatted; `formatProperty` checks
+  // `ctx.seen` to print '[Circular]' for values already on this list.
+  ctx.seen.push(value);
+
+  var output;
+  if (array) {
+    output = formatArray(ctx, value, recurseTimes, visibleKeys, keys);
+  } else {
+    output = keys.map(function(key) {
+      return formatProperty(ctx, value, recurseTimes, visibleKeys, key, array);
+    });
+  }
+
+  ctx.seen.pop();
+
+  return reduceToSingleString(output, base, braces);
+}
+
+
+function formatPrimitive(ctx, value) {
+  if (isUndefined(value))
+    return ctx.stylize('undefined', 'undefined');
+  if (isString(value)) {
+    var simple = '\'' + JSON.stringify(value).replace(/^"|"$/g, '')
+                                             .replace(/'/g, "\\'")
+                                             .replace(/\\"/g, '"') + '\'';
+    return ctx.stylize(simple, 'string');
+  }
+  if (isNumber(value))
+    return ctx.stylize('' + value, 'number');
+  if (isBoolean(value))
+    return ctx.stylize('' + value, 'boolean');
+  // For some reason typeof null is "object", so special case here.
+  if (isNull(value))
+    return ctx.stylize('null', 'null');
+}
+
+
+function formatError(value) {
+  // Uses Error.prototype.toString directly, bypassing any `toString`
+  // defined on the value itself, and wraps the result in square brackets.
+  return '[' + Error.prototype.toString.call(value) + ']';
+}
+
+
+function formatArray(ctx, value, recurseTimes, visibleKeys, keys) {
+  var output = [];
+  for (var i = 0, l = value.length; i < l; ++i) {
+    if (hasOwnProperty(value, String(i))) {
+      output.push(formatProperty(ctx, value, recurseTimes, visibleKeys,
+          String(i), true));
+    } else {
+      output.push('');
+    }
+  }
+  keys.forEach(function(key) {
+    if (!key.match(/^\d+$/)) {
+      output.push(formatProperty(ctx, value, recurseTimes, visibleKeys,
+          key, true));
+    }
+  });
+  return output;
+}
+
+
+function formatProperty(ctx, value, recurseTimes, visibleKeys, key, array) {
+  var name, str, desc;
+  desc = Object.getOwnPropertyDescriptor(value, key) || { value: value[key] };
+  if (desc.get) {
+    if (desc.set) {
+      str = ctx.stylize('[Getter/Setter]', 'special');
+    } else {
+      str = ctx.stylize('[Getter]', 'special');
+    }
+  } else {
+    if (desc.set) {
+      str = ctx.stylize('[Setter]', 'special');
+    }
+  }
+  if (!hasOwnProperty(visibleKeys, key)) {
+    name = '[' + key + ']';
+  }
+  if (!str) {
+    if (ctx.seen.indexOf(desc.value) < 0) {
+      if (isNull(recurseTimes)) {
+        str = formatValue(ctx, desc.value, null);
+      } else {
+        str = formatValue(ctx, desc.value, recurseTimes - 1);
+      }
+      if (str.indexOf('\n') > -1) {
+        if (array) {
+          str = str.split('\n').map(function(line) {
+            return '  ' + line;
+          }).join('\n').substr(2);
+        } else {
+          str = '\n' + str.split('\n').map(function(line) {
+            return '   ' + line;
+          }).join('\n');
+        }
+      }
+    } else {
+      str = ctx.stylize('[Circular]', 'special');
+    }
+  }
+  if (isUndefined(name)) {
+    if (array && key.match(/^\d+$/)) {
+      return str;
+    }
+    name = JSON.stringify('' + key);
+    if (name.match(/^"([a-zA-Z_][a-zA-Z_0-9]*)"$/)) {
+      name = name.substr(1, name.length - 2);
+      name = ctx.stylize(name, 'name');
+    } else {
+      name = name.replace(/'/g, "\\'")
+                 .replace(/\\"/g, '"')
+                 .replace(/(^"|"$)/g, "'");
+      name = ctx.stylize(name, 'string');
+    }
+  }
+
+  return name + ': ' + str;
+}
+
+
+function reduceToSingleString(output, base, braces) {
+  var numLinesEst = 0;
+  var length = output.reduce(function(prev, cur) {
+    numLinesEst++;
+    if (cur.indexOf('\n') >= 0) numLinesEst++;
+    return prev + cur.replace(/\u001b\[\d\d?m/g, '').length + 1;
+  }, 0);
+
+  if (length > 60) {
+    return braces[0] +
+           (base === '' ? '' : base + '\n ') +
+           ' ' +
+           output.join(',\n  ') +
+           ' ' +
+           braces[1];
+  }
+
+  return braces[0] + base + ' ' + output.join(', ') + ' ' + braces[1];
+}
+
+
+// NOTE: These type checking functions intentionally don't use `instanceof`
+// because it is fragile and can be easily faked with `Object.create()`.
+function isArray(ar) {
+  return Array.isArray(ar);
+}
+exports.isArray = isArray;
+
+function isBoolean(arg) {
+  return typeof arg === 'boolean';
+}
+exports.isBoolean = isBoolean;
+
+function isNull(arg) {
+  return arg === null;
+}
+exports.isNull = isNull;
+
+function isNullOrUndefined(arg) {
+  return arg == null;
+}
+exports.isNullOrUndefined = isNullOrUndefined;
+
+function isNumber(arg) {
+  return typeof arg === 'number';
+}
+exports.isNumber = isNumber;
+
+function isString(arg) {
+  return typeof arg === 'string';
+}
+exports.isString = isString;
+
+function isSymbol(arg) {
+  return typeof arg === 'symbol';
+}
+exports.isSymbol = isSymbol;
+
+function isUndefined(arg) {
+  return arg === void 0;
+}
+exports.isUndefined = isUndefined;
+
+function isRegExp(re) {
+  return isObject(re) && objectToString(re) === '[object RegExp]';
+}
+exports.isRegExp = isRegExp;
+
+function isObject(arg) {
+  return typeof arg === 'object' && arg !== null;
+}
+exports.isObject = isObject;
+
+function isDate(d) {
+  return isObject(d) && objectToString(d) === '[object Date]';
+}
+exports.isDate = isDate;
+
+function isError(e) {
+  return isObject(e) &&
+      (objectToString(e) === '[object Error]' || e instanceof Error);
+}
+exports.isError = isError;
+
+function isFunction(arg) {
+  return typeof arg === 'function';
+}
+exports.isFunction = isFunction;
+
+function isPrimitive(arg) {
+  return arg === null ||
+         typeof arg === 'boolean' ||
+         typeof arg === 'number' ||
+         typeof arg === 'string' ||
+         typeof arg === 'symbol' ||  // ES6 symbol
+         typeof arg === 'undefined';
+}
+exports.isPrimitive = isPrimitive;
+
+exports.isBuffer = require('./support/isBuffer');
+
+function objectToString(o) {
+  return Object.prototype.toString.call(o);
+}
+
+
+function pad(n) {
+  return n < 10 ? '0' + n.toString(10) : n.toString(10);
+}
+
+
+var months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
+              'Oct', 'Nov', 'Dec'];
+
+// 26 Feb 16:19:34
+function timestamp() {
+  var d = new Date();
+  var time = [pad(d.getHours()),
+              pad(d.getMinutes()),
+              pad(d.getSeconds())].join(':');
+  return [d.getDate(), months[d.getMonth()], time].join(' ');
+}
+
+
+// log is just a thin wrapper to console.log that prepends a timestamp
+exports.log = function() {
+  console.log('%s - %s', timestamp(), exports.format.apply(exports, arguments));
+};
+
+
+/**
+ * Inherit the prototype methods from one constructor into another.
+ *
+ * The Function.prototype.inherits from lang.js rewritten as a standalone
+ * function (not on Function.prototype). NOTE: If this file is to be loaded
+ * during bootstrapping this function needs to be rewritten using some native
+ * functions as prototype setup using normal JavaScript does not work as
+ * expected during bootstrapping (see mirror.js in r114903).
+ *
+ * @param {function} ctor Constructor function which needs to inherit the
+ *     prototype.
+ * @param {function} superCtor Constructor function to inherit prototype from.
+ */
+exports.inherits = require('inherits');
+
+exports._extend = function(origin, add) {
+  // Don't do anything if add isn't an object
+  if (!add || !isObject(add)) return origin;
+
+  var keys = Object.keys(add);
+  var i = keys.length;
+  while (i--) {
+    origin[keys[i]] = add[keys[i]];
+  }
+  return origin;
+};
+
+function hasOwnProperty(obj, prop) {
+  return Object.prototype.hasOwnProperty.call(obj, prop);
+}
+
+}).call(this,require('_process'),typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {})
+},{"./support/isBuffer":27,"_process":24,"inherits":26}],29:[function(require,module,exports){
+// Returns a wrapper function that returns a wrapped callback
+// The wrapper function should do some stuff, and return a
+// presumably different callback function.
+// This makes sure that own properties are retained, so that
+// decorations and such are not lost along the way.
+module.exports = wrappy
+function wrappy (fn, cb) {
+  if (fn && cb) return wrappy(fn)(cb)
+
+  if (typeof fn !== 'function')
+    throw new TypeError('need wrapper function')
+
+  Object.keys(fn).forEach(function (k) {
+    wrapper[k] = fn[k]
+  })
+
+  return wrapper
+
+  function wrapper() {
+    var args = new Array(arguments.length)
+    for (var i = 0; i < args.length; i++) {
+      args[i] = arguments[i]
+    }
+    var ret = fn.apply(this, args)
+    var cb = args[args.length-1]
+    if (typeof ret === 'function' && ret !== cb) {
+      Object.keys(cb).forEach(function (k) {
+        ret[k] = cb[k]
+      })
+    }
+    return ret
+  }
+}
+
+},{}]},{},[7])(7)
+});
\ No newline at end of file
diff --git a/assets/javascripts/workers/search.b8dbb3d2.min.js b/assets/javascripts/workers/search.b8dbb3d2.min.js
new file mode 100644
index 0000000..c8a1ec8
--- /dev/null
+++ b/assets/javascripts/workers/search.b8dbb3d2.min.js
@@ -0,0 +1,42 @@
+"use strict";(()=>{var xe=Object.create;var U=Object.defineProperty,ve=Object.defineProperties,Se=Object.getOwnPropertyDescriptor,Te=Object.getOwnPropertyDescriptors,Qe=Object.getOwnPropertyNames,Y=Object.getOwnPropertySymbols,Ee=Object.getPrototypeOf,X=Object.prototype.hasOwnProperty,be=Object.prototype.propertyIsEnumerable;var Z=Math.pow,J=(t,e,r)=>e in t?U(t,e,{enumerable:!0,configurable:!0,writable:!0,value:r}):t[e]=r,A=(t,e)=>{for(var r in e||(e={}))X.call(e,r)&&J(t,r,e[r]);if(Y)for(var r of Y(e))be.call(e,r)&&J(t,r,e[r]);return t},G=(t,e)=>ve(t,Te(e));var Le=(t,e)=>()=>(e||t((e={exports:{}}).exports,e),e.exports);var we=(t,e,r,n)=>{if(e&&typeof e=="object"||typeof e=="function")for(let i of Qe(e))!X.call(t,i)&&i!==r&&U(t,i,{get:()=>e[i],enumerable:!(n=Se(e,i))||n.enumerable});return t};var Pe=(t,e,r)=>(r=t!=null?xe(Ee(t)):{},we(e||!t||!t.__esModule?U(r,"default",{value:t,enumerable:!0}):r,t));var B=(t,e,r)=>new Promise((n,i)=>{var s=u=>{try{a(r.next(u))}catch(c){i(c)}},o=u=>{try{a(r.throw(u))}catch(c){i(c)}},a=u=>u.done?n(u.value):Promise.resolve(u.value).then(s,o);a((r=r.apply(t,e)).next())});var te=Le((K,ee)=>{/**
+ * lunr - http://lunrjs.com - A bit like Solr, but much smaller and not as bright - 2.3.9
+ * Copyright (C) 2020 Oliver Nightingale
+ * @license MIT
+ */(function(){var t=function(e){var r=new t.Builder;return r.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),r.searchPipeline.add(t.stemmer),e.call(r,r),r.build()};t.version="2.3.9";/*!
+ * lunr.utils
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.utils={},t.utils.warn=function(e){return function(r){e.console&&console.warn&&console.warn(r)}}(this),t.utils.asString=function(e){return e==null?"":e.toString()},t.utils.clone=function(e){if(e==null)return e;for(var r=Object.create(null),n=Object.keys(e),i=0;i<n.length;i++){var s=n[i],o=e[s];if(Array.isArray(o)){r[s]=o.slice();continue}if(typeof o=="string"||typeof o=="number"||typeof o=="boolean"){r[s]=o;continue}throw new TypeError("clone is not deep and does not support nested objects")}return r},t.FieldRef=function(e,r,n){this.docRef=e,this.fieldName=r,this._stringValue=n},t.FieldRef.joiner="/",t.FieldRef.fromString=function(e){var r=e.indexOf(t.FieldRef.joiner);if(r===-1)throw"malformed field ref string";var n=e.slice(0,r),i=e.slice(r+1);return new t.FieldRef(i,n,e)},t.FieldRef.prototype.toString=function(){return this._stringValue==null&&(this._stringValue=this.fieldName+t.FieldRef.joiner+this.docRef),this._stringValue};/*!
+ * lunr.Set
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.Set=function(e){if(this.elements=Object.create(null),e){this.length=e.length;for(var r=0;r<this.length;r++)this.elements[e[r]]=!0}else this.length=0},t.Set.complete={intersect:function(e){return e},union:function(){return this},contains:function(){return!0}},t.Set.empty={intersect:function(){return this},union:function(e){return e},contains:function(){return!1}},t.Set.prototype.contains=function(e){return!!this.elements[e]},t.Set.prototype.intersect=function(e){var r,n,i,s=[];if(e===t.Set.complete)return this;if(e===t.Set.empty)return e;this.length<e.length?(r=this,n=e):(r=e,n=this),i=Object.keys(r.elements);for(var o=0;o<i.length;o++){var a=i[o];a in n.elements&&s.push(a)}return new t.Set(s)},t.Set.prototype.union=function(e){return e===t.Set.complete?t.Set.complete:e===t.Set.empty?this:new t.Set(Object.keys(this.elements).concat(Object.keys(e.elements)))},t.idf=function(e,r){var n=0;for(var i in e)i!="_index"&&(n+=Object.keys(e[i]).length);var s=(r-n+.5)/(n+.5);return Math.log(1+Math.abs(s))},t.Token=function(e,r){this.str=e||"",this.metadata=r||{}},t.Token.prototype.toString=function(){return this.str},t.Token.prototype.update=function(e){return this.str=e(this.str,this.metadata),this},t.Token.prototype.clone=function(e){return e=e||function(r){return r},new t.Token(e(this.str,this.metadata),this.metadata)};/*!
+ * lunr.tokenizer
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.tokenizer=function(e,r){if(e==null||e==null)return[];if(Array.isArray(e))return e.map(function(g){return new t.Token(t.utils.asString(g).toLowerCase(),t.utils.clone(r))});for(var n=e.toString().toLowerCase(),i=n.length,s=[],o=0,a=0;o<=i;o++){var u=n.charAt(o),c=o-a;if(u.match(t.tokenizer.separator)||o==i){if(c>0){var f=t.utils.clone(r)||{};f.position=[a,c],f.index=s.length,s.push(new t.Token(n.slice(a,o),f))}a=o+1}}return s},t.tokenizer.separator=/[\s\-]+/;/*!
+ * lunr.Pipeline
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.Pipeline=function(){this._stack=[]},t.Pipeline.registeredFunctions=Object.create(null),t.Pipeline.registerFunction=function(e,r){r in this.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+r),e.label=r,t.Pipeline.registeredFunctions[e.label]=e},t.Pipeline.warnIfFunctionNotRegistered=function(e){var r=e.label&&e.label in this.registeredFunctions;r||t.utils.warn(`Function is not registered with pipeline. This may cause problems when serialising the index.
+`,e)},t.Pipeline.load=function(e){var r=new t.Pipeline;return e.forEach(function(n){var i=t.Pipeline.registeredFunctions[n];if(i)r.add(i);else throw new Error("Cannot load unregistered function: "+n)}),r},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(r){t.Pipeline.warnIfFunctionNotRegistered(r),this._stack.push(r)},this)},t.Pipeline.prototype.after=function(e,r){t.Pipeline.warnIfFunctionNotRegistered(r);var n=this._stack.indexOf(e);if(n==-1)throw new Error("Cannot find existingFn");n=n+1,this._stack.splice(n,0,r)},t.Pipeline.prototype.before=function(e,r){t.Pipeline.warnIfFunctionNotRegistered(r);var n=this._stack.indexOf(e);if(n==-1)throw new Error("Cannot find existingFn");this._stack.splice(n,0,r)},t.Pipeline.prototype.remove=function(e){var r=this._stack.indexOf(e);r!=-1&&this._stack.splice(r,1)},t.Pipeline.prototype.run=function(e){for(var r=this._stack.length,n=0;n<r;n++){for(var i=this._stack[n],s=[],o=0;o<e.length;o++){var a=i(e[o],o,e);if(!(a==null||a===""))if(Array.isArray(a))for(var u=0;u<a.length;u++)s.push(a[u]);else s.push(a)}e=s}return e},t.Pipeline.prototype.runString=function(e,r){var n=new t.Token(e,r);return this.run([n]).map(function(i){return i.toString()})},t.Pipeline.prototype.reset=function(){this._stack=[]},t.Pipeline.prototype.toJSON=function(){return this._stack.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})};/*!
+ * lunr.Vector
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.Vector=function(e){this._magnitude=0,this.elements=e||[]},t.Vector.prototype.positionForIndex=function(e){if(this.elements.length==0)return 0;for(var r=0,n=this.elements.length/2,i=n-r,s=Math.floor(i/2),o=this.elements[s*2];i>1&&(o<e&&(r=s),o>e&&(n=s),o!=e);)i=n-r,s=r+Math.floor(i/2),o=this.elements[s*2];if(o==e||o>e)return s*2;if(o<e)return(s+1)*2},t.Vector.prototype.insert=function(e,r){this.upsert(e,r,function(){throw"duplicate index"})},t.Vector.prototype.upsert=function(e,r,n){this._magnitude=0;var i=this.positionForIndex(e);this.elements[i]==e?this.elements[i+1]=n(this.elements[i+1],r):this.elements.splice(i,0,e,r)},t.Vector.prototype.magnitude=function(){if(this._magnitude)return this._magnitude;for(var e=0,r=this.elements.length,n=1;n<r;n+=2){var i=this.elements[n];e+=i*i}return this._magnitude=Math.sqrt(e)},t.Vector.prototype.dot=function(e){for(var r=0,n=this.elements,i=e.elements,s=n.length,o=i.length,a=0,u=0,c=0,f=0;c<s&&f<o;)a=n[c],u=i[f],a<u?c+=2:a>u?f+=2:a==u&&(r+=n[c+1]*i[f+1],c+=2,f+=2);return r},t.Vector.prototype.similarity=function(e){return this.dot(e)/this.magnitude()||0},t.Vector.prototype.toArray=function(){for(var e=new Array(this.elements.length/2),r=1,n=0;r<this.elements.length;r+=2,n++)e[n]=this.elements[r];return e},t.Vector.prototype.toJSON=function(){return this.elements};/*!
+ * lunr.stemmer
+ * Copyright (C) 2020 Oliver Nightingale
+ * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt
+ */t.stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},r={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},n="[^aeiou]",i="[aeiouy]",s=n+"[^aeiouy]*",o=i+"[aeiou]*",a="^("+s+")?"+o+s,u="^("+s+")?"+o+s+"("+o+")?$",c="^("+s+")?"+o+s+o+s,f="^("+s+")?"+i,g=new RegExp(a),l=new RegExp(c),m=new RegExp(u),x=new RegExp(f),v=/^(.+?)(ss|i)es$/,d=/^(.+?)([^s])s$/,y=/^(.+?)eed$/,b=/^(.+?)(ed|ing)$/,E=/.$/,w=/(at|bl|iz)$/,R=new RegExp("([^aeiouylsz])\\1$"),j=new RegExp("^"+s+i+"[^aeiouwxy]$"),_=/^(.+?[^aeiou])y$/,D=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,N=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,C=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,V=/^(.+?)(s|t)(ion)$/,P=/^(.+?)e$/,z=/ll$/,$=new RegExp("^"+s+i+"[^aeiouwxy]$"),M=function(h){var S,k,L,p,T,O,F;if(h.length<3)return h;if(L=h.substr(0,1),L=="y"&&(h=L.toUpperCase()+h.substr(1)),p=v,T=d,p.test(h)?h=h.replace(p,"$1$2"):T.test(h)&&(h=h.replace(T,"$1$2")),p=y,T=b,p.test(h)){var Q=p.exec(h);p=g,p.test(Q[1])&&(p=E,h=h.replace(p,""))}else if(T.test(h)){var Q=T.exec(h);S=Q[1],T=x,T.test(S)&&(h=S,T=w,O=R,F=j,T.test(h)?h=h+"e":O.test(h)?(p=E,h=h.replace(p,"")):F.test(h)&&(h=h+"e"))}if(p=_,p.test(h)){var Q=p.exec(h);S=Q[1],h=S+"i"}if(p=D,p.test(h)){var Q=p.exec(h);S=Q[1],k=Q[2],p=g,p.test(S)&&(h=S+e[k])}if(p=N,p.test(h)){var Q=p.exec(h);S=Q[1],k=Q[2],p=g,p.test(S)&&(h=S+r[k])}if(p=C,T=V,p.test(h)){var Q=p.exec(h);S=Q[1],p=l,p.test(S)&&(h=S)}else if(T.test(h)){var Q=T.exec(h);S=Q[1]+Q[2],T=l,T.test(S)&&(h=S)}if(p=P,p.test(h)){var Q=p.exec(h);S=Q[1],p=l,T=m,O=$,(p.test(S)||T.test(S)&&!O.test(S))&&(h=S)}return 
p=z,T=l,p.test(h)&&T.test(h)&&(p=E,h=h.replace(p,"")),L=="y"&&(h=L.toLowerCase()+h.substr(1)),h};return function(I){return I.update(M)}}(),t.Pipeline.registerFunction(t.stemmer,"stemmer");/*!
+ * lunr.stopWordFilter
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.generateStopWordFilter=function(e){var r=e.reduce(function(n,i){return n[i]=i,n},{});return function(n){if(n&&r[n.toString()]!==n.toString())return n}},t.stopWordFilter=t.generateStopWordFilter(["a","able","about","across","after","all","almost","also","am","among","an","and","any","are","as","at","be","because","been","but","by","can","cannot","could","dear","did","do","does","either","else","ever","every","for","from","get","got","had","has","have","he","her","hers","him","his","how","however","i","if","in","into","is","it","its","just","least","let","like","likely","may","me","might","most","must","my","neither","no","nor","not","of","off","often","on","only","or","other","our","own","rather","said","say","says","she","should","since","so","some","than","that","the","their","them","then","there","these","they","this","tis","to","too","twas","us","wants","was","we","were","what","when","where","which","while","who","whom","why","will","with","would","yet","you","your"]),t.Pipeline.registerFunction(t.stopWordFilter,"stopWordFilter");/*!
+ * lunr.trimmer
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.trimmer=function(e){return e.update(function(r){return r.replace(/^\W+/,"").replace(/\W+$/,"")})},t.Pipeline.registerFunction(t.trimmer,"trimmer");/*!
+ * lunr.TokenSet
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.TokenSet=function(){this.final=!1,this.edges={},this.id=t.TokenSet._nextId,t.TokenSet._nextId+=1},t.TokenSet._nextId=1,t.TokenSet.fromArray=function(e){for(var r=new t.TokenSet.Builder,n=0,i=e.length;n<i;n++)r.insert(e[n]);return r.finish(),r.root},t.TokenSet.fromClause=function(e){return"editDistance"in e?t.TokenSet.fromFuzzyString(e.term,e.editDistance):t.TokenSet.fromString(e.term)},t.TokenSet.fromFuzzyString=function(e,r){for(var n=new t.TokenSet,i=[{node:n,editsRemaining:r,str:e}];i.length;){var s=i.pop();if(s.str.length>0){var o=s.str.charAt(0),a;o in s.node.edges?a=s.node.edges[o]:(a=new t.TokenSet,s.node.edges[o]=a),s.str.length==1&&(a.final=!0),i.push({node:a,editsRemaining:s.editsRemaining,str:s.str.slice(1)})}if(s.editsRemaining!=0){if("*"in s.node.edges)var u=s.node.edges["*"];else{var u=new t.TokenSet;s.node.edges["*"]=u}if(s.str.length==0&&(u.final=!0),i.push({node:u,editsRemaining:s.editsRemaining-1,str:s.str}),s.str.length>1&&i.push({node:s.node,editsRemaining:s.editsRemaining-1,str:s.str.slice(1)}),s.str.length==1&&(s.node.final=!0),s.str.length>=1){if("*"in s.node.edges)var c=s.node.edges["*"];else{var c=new t.TokenSet;s.node.edges["*"]=c}s.str.length==1&&(c.final=!0),i.push({node:c,editsRemaining:s.editsRemaining-1,str:s.str.slice(1)})}if(s.str.length>1){var f=s.str.charAt(0),g=s.str.charAt(1),l;g in s.node.edges?l=s.node.edges[g]:(l=new t.TokenSet,s.node.edges[g]=l),s.str.length==1&&(l.final=!0),i.push({node:l,editsRemaining:s.editsRemaining-1,str:f+s.str.slice(2)})}}}return n},t.TokenSet.fromString=function(e){for(var r=new t.TokenSet,n=r,i=0,s=e.length;i<s;i++){var o=e[i],a=i==s-1;if(o=="*")r.edges[o]=r,r.final=a;else{var u=new t.TokenSet;u.final=a,r.edges[o]=u,r=u}}return n},t.TokenSet.prototype.toArray=function(){for(var e=[],r=[{prefix:"",node:this}];r.length;){var n=r.pop(),i=Object.keys(n.node.edges),s=i.length;n.node.final&&(n.prefix.charAt(0),e.push(n.prefix));for(var o=0;o<s;o++){var 
a=i[o];r.push({prefix:n.prefix.concat(a),node:n.node.edges[a]})}}return e},t.TokenSet.prototype.toString=function(){if(this._str)return this._str;for(var e=this.final?"1":"0",r=Object.keys(this.edges).sort(),n=r.length,i=0;i<n;i++){var s=r[i],o=this.edges[s];e=e+s+o.id}return e},t.TokenSet.prototype.intersect=function(e){for(var r=new t.TokenSet,n=void 0,i=[{qNode:e,output:r,node:this}];i.length;){n=i.pop();for(var s=Object.keys(n.qNode.edges),o=s.length,a=Object.keys(n.node.edges),u=a.length,c=0;c<o;c++)for(var f=s[c],g=0;g<u;g++){var l=a[g];if(l==f||f=="*"){var m=n.node.edges[l],x=n.qNode.edges[f],v=m.final&&x.final,d=void 0;l in n.output.edges?(d=n.output.edges[l],d.final=d.final||v):(d=new t.TokenSet,d.final=v,n.output.edges[l]=d),i.push({qNode:x,output:d,node:m})}}}return r},t.TokenSet.Builder=function(){this.previousWord="",this.root=new t.TokenSet,this.uncheckedNodes=[],this.minimizedNodes={}},t.TokenSet.Builder.prototype.insert=function(e){var r,n=0;if(e<this.previousWord)throw new Error("Out of order word insertion");for(var i=0;i<e.length&&i<this.previousWord.length&&e[i]==this.previousWord[i];i++)n++;this.minimize(n),this.uncheckedNodes.length==0?r=this.root:r=this.uncheckedNodes[this.uncheckedNodes.length-1].child;for(var i=n;i<e.length;i++){var s=new t.TokenSet,o=e[i];r.edges[o]=s,this.uncheckedNodes.push({parent:r,char:o,child:s}),r=s}r.final=!0,this.previousWord=e},t.TokenSet.Builder.prototype.finish=function(){this.minimize(0)},t.TokenSet.Builder.prototype.minimize=function(e){for(var r=this.uncheckedNodes.length-1;r>=e;r--){var n=this.uncheckedNodes[r],i=n.child.toString();i in this.minimizedNodes?n.parent.edges[n.char]=this.minimizedNodes[i]:(n.child._str=i,this.minimizedNodes[i]=n.child),this.uncheckedNodes.pop()}};/*!
+ * lunr.Index
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.Index=function(e){this.invertedIndex=e.invertedIndex,this.fieldVectors=e.fieldVectors,this.tokenSet=e.tokenSet,this.fields=e.fields,this.pipeline=e.pipeline},t.Index.prototype.search=function(e){return this.query(function(r){var n=new t.QueryParser(e,r);n.parse()})},t.Index.prototype.query=function(e){for(var r=new t.Query(this.fields),n=Object.create(null),i=Object.create(null),s=Object.create(null),o=Object.create(null),a=Object.create(null),u=0;u<this.fields.length;u++)i[this.fields[u]]=new t.Vector;e.call(r,r);for(var u=0;u<r.clauses.length;u++){var c=r.clauses[u],f=null,g=t.Set.empty;c.usePipeline?f=this.pipeline.runString(c.term,{fields:c.fields}):f=[c.term];for(var l=0;l<f.length;l++){var m=f[l];c.term=m;var x=t.TokenSet.fromClause(c),v=this.tokenSet.intersect(x).toArray();if(v.length===0&&c.presence===t.Query.presence.REQUIRED){for(var d=0;d<c.fields.length;d++){var y=c.fields[d];o[y]=t.Set.empty}break}for(var b=0;b<v.length;b++)for(var E=v[b],w=this.invertedIndex[E],R=w._index,d=0;d<c.fields.length;d++){var y=c.fields[d],j=w[y],_=Object.keys(j),D=E+"/"+y,N=new t.Set(_);if(c.presence==t.Query.presence.REQUIRED&&(g=g.union(N),o[y]===void 0&&(o[y]=t.Set.complete)),c.presence==t.Query.presence.PROHIBITED){a[y]===void 0&&(a[y]=t.Set.empty),a[y]=a[y].union(N);continue}if(i[y].upsert(R,c.boost,function(ye,me){return ye+me}),!s[D]){for(var C=0;C<_.length;C++){var V=_[C],P=new t.FieldRef(V,y),z=j[V],$;($=n[P])===void 0?n[P]=new t.MatchData(E,y,z):$.add(E,y,z)}s[D]=!0}}}if(c.presence===t.Query.presence.REQUIRED)for(var d=0;d<c.fields.length;d++){var y=c.fields[d];o[y]=o[y].intersect(g)}}for(var M=t.Set.complete,I=t.Set.empty,u=0;u<this.fields.length;u++){var y=this.fields[u];o[y]&&(M=M.intersect(o[y])),a[y]&&(I=I.union(a[y]))}var h=Object.keys(n),S=[],k=Object.create(null);if(r.isNegated()){h=Object.keys(this.fieldVectors);for(var u=0;u<h.length;u++){var P=h[u],L=t.FieldRef.fromString(P);n[P]=new t.MatchData}}for(var u=0;u<h.length;u++){var 
L=t.FieldRef.fromString(h[u]),p=L.docRef;if(M.contains(p)&&!I.contains(p)){var T=this.fieldVectors[L],O=i[L.fieldName].similarity(T),F;if((F=k[p])!==void 0)F.score+=O,F.matchData.combine(n[L]);else{var Q={ref:p,score:O,matchData:n[L]};k[p]=Q,S.push(Q)}}}return S.sort(function(pe,ge){return ge.score-pe.score})},t.Index.prototype.toJSON=function(){var e=Object.keys(this.invertedIndex).sort().map(function(n){return[n,this.invertedIndex[n]]},this),r=Object.keys(this.fieldVectors).map(function(n){return[n,this.fieldVectors[n].toJSON()]},this);return{version:t.version,fields:this.fields,fieldVectors:r,invertedIndex:e,pipeline:this.pipeline.toJSON()}},t.Index.load=function(e){var r={},n={},i=e.fieldVectors,s=Object.create(null),o=e.invertedIndex,a=new t.TokenSet.Builder,u=t.Pipeline.load(e.pipeline);e.version!=t.version&&t.utils.warn("Version mismatch when loading serialised index. Current version of lunr '"+t.version+"' does not match serialized index '"+e.version+"'");for(var c=0;c<i.length;c++){var f=i[c],g=f[0],l=f[1];n[g]=new t.Vector(l)}for(var c=0;c<o.length;c++){var f=o[c],m=f[0],x=f[1];a.insert(m),s[m]=x}return a.finish(),r.fields=e.fields,r.fieldVectors=n,r.invertedIndex=s,r.tokenSet=a.root,r.pipeline=u,new t.Index(r)};/*!
+ * lunr.Builder
+ * Copyright (C) 2020 Oliver Nightingale
+ */t.Builder=function(){this._ref="id",this._fields=Object.create(null),this._documents=Object.create(null),this.invertedIndex=Object.create(null),this.fieldTermFrequencies={},this.fieldLengths={},this.tokenizer=t.tokenizer,this.pipeline=new t.Pipeline,this.searchPipeline=new t.Pipeline,this.documentCount=0,this._b=.75,this._k1=1.2,this.termIndex=0,this.metadataWhitelist=[]},t.Builder.prototype.ref=function(e){this._ref=e},t.Builder.prototype.field=function(e,r){if(/\//.test(e))throw new RangeError("Field '"+e+"' contains illegal character '/'");this._fields[e]=r||{}},t.Builder.prototype.b=function(e){e<0?this._b=0:e>1?this._b=1:this._b=e},t.Builder.prototype.k1=function(e){this._k1=e},t.Builder.prototype.add=function(e,r){var n=e[this._ref],i=Object.keys(this._fields);this._documents[n]=r||{},this.documentCount+=1;for(var s=0;s<i.length;s++){var o=i[s],a=this._fields[o].extractor,u=a?a(e):e[o],c=this.tokenizer(u,{fields:[o]}),f=this.pipeline.run(c),g=new t.FieldRef(n,o),l=Object.create(null);this.fieldTermFrequencies[g]=l,this.fieldLengths[g]=0,this.fieldLengths[g]+=f.length;for(var m=0;m<f.length;m++){var x=f[m];if(l[x]==null&&(l[x]=0),l[x]+=1,this.invertedIndex[x]==null){var v=Object.create(null);v._index=this.termIndex,this.termIndex+=1;for(var d=0;d<i.length;d++)v[i[d]]=Object.create(null);this.invertedIndex[x]=v}this.invertedIndex[x][o][n]==null&&(this.invertedIndex[x][o][n]=Object.create(null));for(var y=0;y<this.metadataWhitelist.length;y++){var b=this.metadataWhitelist[y],E=x.metadata[b];this.invertedIndex[x][o][n][b]==null&&(this.invertedIndex[x][o][n][b]=[]),this.invertedIndex[x][o][n][b].push(E)}}}},t.Builder.prototype.calculateAverageFieldLengths=function(){for(var e=Object.keys(this.fieldLengths),r=e.length,n={},i={},s=0;s<r;s++){var o=t.FieldRef.fromString(e[s]),a=o.fieldName;i[a]||(i[a]=0),i[a]+=1,n[a]||(n[a]=0),n[a]+=this.fieldLengths[o]}for(var u=Object.keys(this._fields),s=0;s<u.length;s++){var 
c=u[s];n[c]=n[c]/i[c]}this.averageFieldLength=n},t.Builder.prototype.createFieldVectors=function(){for(var e={},r=Object.keys(this.fieldTermFrequencies),n=r.length,i=Object.create(null),s=0;s<n;s++){for(var o=t.FieldRef.fromString(r[s]),a=o.fieldName,u=this.fieldLengths[o],c=new t.Vector,f=this.fieldTermFrequencies[o],g=Object.keys(f),l=g.length,m=this._fields[a].boost||1,x=this._documents[o.docRef].boost||1,v=0;v<l;v++){var d=g[v],y=f[d],b=this.invertedIndex[d]._index,E,w,R;i[d]===void 0?(E=t.idf(this.invertedIndex[d],this.documentCount),i[d]=E):E=i[d],w=E*((this._k1+1)*y)/(this._k1*(1-this._b+this._b*(u/this.averageFieldLength[a]))+y),w*=m,w*=x,R=Math.round(w*1e3)/1e3,c.insert(b,R)}e[o]=c}this.fieldVectors=e},t.Builder.prototype.createTokenSet=function(){this.tokenSet=t.TokenSet.fromArray(Object.keys(this.invertedIndex).sort())},t.Builder.prototype.build=function(){return this.calculateAverageFieldLengths(),this.createFieldVectors(),this.createTokenSet(),new t.Index({invertedIndex:this.invertedIndex,fieldVectors:this.fieldVectors,tokenSet:this.tokenSet,fields:Object.keys(this._fields),pipeline:this.searchPipeline})},t.Builder.prototype.use=function(e){var r=Array.prototype.slice.call(arguments,1);r.unshift(this),e.apply(this,r)},t.MatchData=function(e,r,n){for(var i=Object.create(null),s=Object.keys(n||{}),o=0;o<s.length;o++){var a=s[o];i[a]=n[a].slice()}this.metadata=Object.create(null),e!==void 0&&(this.metadata[e]=Object.create(null),this.metadata[e][r]=i)},t.MatchData.prototype.combine=function(e){for(var r=Object.keys(e.metadata),n=0;n<r.length;n++){var i=r[n],s=Object.keys(e.metadata[i]);this.metadata[i]==null&&(this.metadata[i]=Object.create(null));for(var o=0;o<s.length;o++){var a=s[o],u=Object.keys(e.metadata[i][a]);this.metadata[i][a]==null&&(this.metadata[i][a]=Object.create(null));for(var c=0;c<u.length;c++){var 
f=u[c];this.metadata[i][a][f]==null?this.metadata[i][a][f]=e.metadata[i][a][f]:this.metadata[i][a][f]=this.metadata[i][a][f].concat(e.metadata[i][a][f])}}}},t.MatchData.prototype.add=function(e,r,n){if(!(e in this.metadata)){this.metadata[e]=Object.create(null),this.metadata[e][r]=n;return}if(!(r in this.metadata[e])){this.metadata[e][r]=n;return}for(var i=Object.keys(n),s=0;s<i.length;s++){var o=i[s];o in this.metadata[e][r]?this.metadata[e][r][o]=this.metadata[e][r][o].concat(n[o]):this.metadata[e][r][o]=n[o]}},t.Query=function(e){this.clauses=[],this.allFields=e},t.Query.wildcard=new String("*"),t.Query.wildcard.NONE=0,t.Query.wildcard.LEADING=1,t.Query.wildcard.TRAILING=2,t.Query.presence={OPTIONAL:1,REQUIRED:2,PROHIBITED:3},t.Query.prototype.clause=function(e){return"fields"in e||(e.fields=this.allFields),"boost"in e||(e.boost=1),"usePipeline"in e||(e.usePipeline=!0),"wildcard"in e||(e.wildcard=t.Query.wildcard.NONE),e.wildcard&t.Query.wildcard.LEADING&&e.term.charAt(0)!=t.Query.wildcard&&(e.term="*"+e.term),e.wildcard&t.Query.wildcard.TRAILING&&e.term.slice(-1)!=t.Query.wildcard&&(e.term=""+e.term+"*"),"presence"in e||(e.presence=t.Query.presence.OPTIONAL),this.clauses.push(e),this},t.Query.prototype.isNegated=function(){for(var e=0;e<this.clauses.length;e++)if(this.clauses[e].presence!=t.Query.presence.PROHIBITED)return!1;return!0},t.Query.prototype.term=function(e,r){if(Array.isArray(e))return e.forEach(function(i){this.term(i,t.utils.clone(r))},this),this;var n=r||{};return n.term=e.toString(),this.clause(n),this},t.QueryParseError=function(e,r,n){this.name="QueryParseError",this.message=e,this.start=r,this.end=n},t.QueryParseError.prototype=new Error,t.QueryLexer=function(e){this.lexemes=[],this.str=e,this.length=e.length,this.pos=0,this.start=0,this.escapeCharPositions=[]},t.QueryLexer.prototype.run=function(){for(var e=t.QueryLexer.lexText;e;)e=e(this)},t.QueryLexer.prototype.sliceString=function(){for(var 
e=[],r=this.start,n=this.pos,i=0;i<this.escapeCharPositions.length;i++)n=this.escapeCharPositions[i],e.push(this.str.slice(r,n)),r=n+1;return e.push(this.str.slice(r,this.pos)),this.escapeCharPositions.length=0,e.join("")},t.QueryLexer.prototype.emit=function(e){this.lexemes.push({type:e,str:this.sliceString(),start:this.start,end:this.pos}),this.start=this.pos},t.QueryLexer.prototype.escapeCharacter=function(){this.escapeCharPositions.push(this.pos-1),this.pos+=1},t.QueryLexer.prototype.next=function(){if(this.pos>=this.length)return t.QueryLexer.EOS;var e=this.str.charAt(this.pos);return this.pos+=1,e},t.QueryLexer.prototype.width=function(){return this.pos-this.start},t.QueryLexer.prototype.ignore=function(){this.start==this.pos&&(this.pos+=1),this.start=this.pos},t.QueryLexer.prototype.backup=function(){this.pos-=1},t.QueryLexer.prototype.acceptDigitRun=function(){var e,r;do e=this.next(),r=e.charCodeAt(0);while(r>47&&r<58);e!=t.QueryLexer.EOS&&this.backup()},t.QueryLexer.prototype.more=function(){return this.pos<this.length},t.QueryLexer.EOS="EOS",t.QueryLexer.FIELD="FIELD",t.QueryLexer.TERM="TERM",t.QueryLexer.EDIT_DISTANCE="EDIT_DISTANCE",t.QueryLexer.BOOST="BOOST",t.QueryLexer.PRESENCE="PRESENCE",t.QueryLexer.lexField=function(e){return e.backup(),e.emit(t.QueryLexer.FIELD),e.ignore(),t.QueryLexer.lexText},t.QueryLexer.lexTerm=function(e){if(e.width()>1&&(e.backup(),e.emit(t.QueryLexer.TERM)),e.ignore(),e.more())return t.QueryLexer.lexText},t.QueryLexer.lexEditDistance=function(e){return e.ignore(),e.acceptDigitRun(),e.emit(t.QueryLexer.EDIT_DISTANCE),t.QueryLexer.lexText},t.QueryLexer.lexBoost=function(e){return e.ignore(),e.acceptDigitRun(),e.emit(t.QueryLexer.BOOST),t.QueryLexer.lexText},t.QueryLexer.lexEOS=function(e){e.width()>0&&e.emit(t.QueryLexer.TERM)},t.QueryLexer.termSeparator=t.tokenizer.separator,t.QueryLexer.lexText=function(e){for(;;){var r=e.next();if(r==t.QueryLexer.EOS)return 
t.QueryLexer.lexEOS;if(r.charCodeAt(0)==92){e.escapeCharacter();continue}if(r==":")return t.QueryLexer.lexField;if(r=="~")return e.backup(),e.width()>0&&e.emit(t.QueryLexer.TERM),t.QueryLexer.lexEditDistance;if(r=="^")return e.backup(),e.width()>0&&e.emit(t.QueryLexer.TERM),t.QueryLexer.lexBoost;if(r=="+"&&e.width()===1||r=="-"&&e.width()===1)return e.emit(t.QueryLexer.PRESENCE),t.QueryLexer.lexText;if(r.match(t.QueryLexer.termSeparator))return t.QueryLexer.lexTerm}},t.QueryParser=function(e,r){this.lexer=new t.QueryLexer(e),this.query=r,this.currentClause={},this.lexemeIdx=0},t.QueryParser.prototype.parse=function(){this.lexer.run(),this.lexemes=this.lexer.lexemes;for(var e=t.QueryParser.parseClause;e;)e=e(this);return this.query},t.QueryParser.prototype.peekLexeme=function(){return this.lexemes[this.lexemeIdx]},t.QueryParser.prototype.consumeLexeme=function(){var e=this.peekLexeme();return this.lexemeIdx+=1,e},t.QueryParser.prototype.nextClause=function(){var e=this.currentClause;this.query.clause(e),this.currentClause={}},t.QueryParser.parseClause=function(e){var r=e.peekLexeme();if(r!=null)switch(r.type){case t.QueryLexer.PRESENCE:return t.QueryParser.parsePresence;case t.QueryLexer.FIELD:return t.QueryParser.parseField;case t.QueryLexer.TERM:return t.QueryParser.parseTerm;default:var n="expected either a field or a term, found "+r.type;throw r.str.length>=1&&(n+=" with value '"+r.str+"'"),new t.QueryParseError(n,r.start,r.end)}},t.QueryParser.parsePresence=function(e){var r=e.consumeLexeme();if(r!=null){switch(r.str){case"-":e.currentClause.presence=t.Query.presence.PROHIBITED;break;case"+":e.currentClause.presence=t.Query.presence.REQUIRED;break;default:var n="unrecognised presence operator'"+r.str+"'";throw new t.QueryParseError(n,r.start,r.end)}var i=e.peekLexeme();if(i==null){var n="expecting term or field, found nothing";throw new t.QueryParseError(n,r.start,r.end)}switch(i.type){case t.QueryLexer.FIELD:return t.QueryParser.parseField;case 
t.QueryLexer.TERM:return t.QueryParser.parseTerm;default:var n="expecting term or field, found '"+i.type+"'";throw new t.QueryParseError(n,i.start,i.end)}}},t.QueryParser.parseField=function(e){var r=e.consumeLexeme();if(r!=null){if(e.query.allFields.indexOf(r.str)==-1){var n=e.query.allFields.map(function(o){return"'"+o+"'"}).join(", "),i="unrecognised field '"+r.str+"', possible fields: "+n;throw new t.QueryParseError(i,r.start,r.end)}e.currentClause.fields=[r.str];var s=e.peekLexeme();if(s==null){var i="expecting term, found nothing";throw new t.QueryParseError(i,r.start,r.end)}switch(s.type){case t.QueryLexer.TERM:return t.QueryParser.parseTerm;default:var i="expecting term, found '"+s.type+"'";throw new t.QueryParseError(i,s.start,s.end)}}},t.QueryParser.parseTerm=function(e){var r=e.consumeLexeme();if(r!=null){e.currentClause.term=r.str.toLowerCase(),r.str.indexOf("*")!=-1&&(e.currentClause.usePipeline=!1);var n=e.peekLexeme();if(n==null){e.nextClause();return}switch(n.type){case t.QueryLexer.TERM:return e.nextClause(),t.QueryParser.parseTerm;case t.QueryLexer.FIELD:return e.nextClause(),t.QueryParser.parseField;case t.QueryLexer.EDIT_DISTANCE:return t.QueryParser.parseEditDistance;case t.QueryLexer.BOOST:return t.QueryParser.parseBoost;case t.QueryLexer.PRESENCE:return e.nextClause(),t.QueryParser.parsePresence;default:var i="Unexpected lexeme type '"+n.type+"'";throw new t.QueryParseError(i,n.start,n.end)}}},t.QueryParser.parseEditDistance=function(e){var r=e.consumeLexeme();if(r!=null){var n=parseInt(r.str,10);if(isNaN(n)){var i="edit distance must be numeric";throw new t.QueryParseError(i,r.start,r.end)}e.currentClause.editDistance=n;var s=e.peekLexeme();if(s==null){e.nextClause();return}switch(s.type){case t.QueryLexer.TERM:return e.nextClause(),t.QueryParser.parseTerm;case t.QueryLexer.FIELD:return e.nextClause(),t.QueryParser.parseField;case t.QueryLexer.EDIT_DISTANCE:return t.QueryParser.parseEditDistance;case t.QueryLexer.BOOST:return 
t.QueryParser.parseBoost;case t.QueryLexer.PRESENCE:return e.nextClause(),t.QueryParser.parsePresence;default:var i="Unexpected lexeme type '"+s.type+"'";throw new t.QueryParseError(i,s.start,s.end)}}},t.QueryParser.parseBoost=function(e){var r=e.consumeLexeme();if(r!=null){var n=parseInt(r.str,10);if(isNaN(n)){var i="boost must be numeric";throw new t.QueryParseError(i,r.start,r.end)}e.currentClause.boost=n;var s=e.peekLexeme();if(s==null){e.nextClause();return}switch(s.type){case t.QueryLexer.TERM:return e.nextClause(),t.QueryParser.parseTerm;case t.QueryLexer.FIELD:return e.nextClause(),t.QueryParser.parseField;case t.QueryLexer.EDIT_DISTANCE:return t.QueryParser.parseEditDistance;case t.QueryLexer.BOOST:return t.QueryParser.parseBoost;case t.QueryLexer.PRESENCE:return e.nextClause(),t.QueryParser.parsePresence;default:var i="Unexpected lexeme type '"+s.type+"'";throw new t.QueryParseError(i,s.start,s.end)}}},function(e,r){typeof define=="function"&&define.amd?define(r):typeof K=="object"?ee.exports=r():e.lunr=r()}(this,function(){return t})})()});var de=Pe(te());function re(t,e=document){let r=ke(t,e);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${t}" to be present`);return r}function ke(t,e=document){return e.querySelector(t)||void 0}Object.entries||(Object.entries=function(t){let e=[];for(let r of Object.keys(t))e.push([r,t[r]]);return e});Object.values||(Object.values=function(t){let e=[];for(let r of Object.keys(t))e.push(t[r]);return e});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(t,e){typeof t=="object"?(this.scrollLeft=t.left,this.scrollTop=t.top):(this.scrollLeft=t,this.scrollTop=e)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...t){let e=this.parentNode;if(e){t.length===0&&e.removeChild(this);for(let r=t.length-1;r>=0;r--){let n=t[r];typeof 
n=="string"?n=document.createTextNode(n):n.parentNode&&n.parentNode.removeChild(n),r?e.insertBefore(this.previousSibling,n):e.replaceChild(n,this)}}}));function ne(t){let e=new Map;for(let r of t){let[n]=r.location.split("#"),i=e.get(n);typeof i=="undefined"?e.set(n,r):(e.set(r.location,r),r.parent=i)}return e}function W(t,e,r){var s;e=new RegExp(e,"g");let n,i=0;do{n=e.exec(t);let o=(s=n==null?void 0:n.index)!=null?s:t.length;if(i<o&&r(i,o),n){let[a]=n;i=n.index+a.length,a.length===0&&(e.lastIndex=n.index+1)}}while(n)}function ie(t,e){let r=0,n=0,i=0;for(let s=0;i<t.length;i++)t.charAt(i)==="<"&&i>n?e(r,1,n,n=i):t.charAt(i)===">"&&(t.charAt(n+1)==="/"?--s===0&&e(r++,2,n,i+1):t.charAt(i-1)!=="/"&&s++===0&&e(r,0,n,i+1),n=i+1);i>n&&e(r,1,n,i)}function se(t,e,r,n=!1){return q([t],e,r,n).pop()}function q(t,e,r,n=!1){let i=[0];for(let s=1;s<e.length;s++){let o=e[s-1],a=e[s],u=o[o.length-1]>>>2&1023,c=a[0]>>>12;i.push(+(u>c)+i[i.length-1])}return t.map((s,o)=>{let a=0,u=new Map;for(let f of r.sort((g,l)=>g-l)){let g=f&1048575,l=f>>>20;if(i[l]!==o)continue;let m=u.get(l);typeof m=="undefined"&&u.set(l,m=[]),m.push(g)}if(u.size===0)return s;let c=[];for(let[f,g]of u){let l=e[f],m=l[0]>>>12,x=l[l.length-1]>>>12,v=l[l.length-1]>>>2&1023;n&&m>a&&c.push(s.slice(a,m));let d=s.slice(m,x+v);for(let y of g.sort((b,E)=>E-b)){let b=(l[y]>>>12)-m,E=(l[y]>>>2&1023)+b;d=[d.slice(0,b),"<mark>",d.slice(b,E),"</mark>",d.slice(E)].join("")}if(a=x+v,c.push(d)===2)break}return n&&a<s.length&&c.push(s.slice(a)),c.join("")})}function oe(t){let e=[];if(typeof t=="undefined")return e;let r=Array.isArray(t)?t:[t];for(let n=0;n<r.length;n++){let i=lunr.tokenizer.table,s=i.length;ie(r[n],(o,a,u,c)=>{var f;switch(i[f=o+=s]||(i[f]=[]),a){case 0:case 2:i[o].push(u<<12|c-u<<2|a);break;case 1:let g=r[n].slice(u,c);W(g,lunr.tokenizer.separator,(l,m)=>{if(typeof lunr.segmenter!="undefined"){let x=g.slice(l,m);if(/^[MHIK]$/.test(lunr.segmenter.ctype_(x))){let v=lunr.segmenter.segment(x);for(let 
d=0,y=0;d<v.length;d++)i[o]||(i[o]=[]),i[o].push(u+l+y<<12|v[d].length<<2|a),e.push(new lunr.Token(v[d].toLowerCase(),{position:o<<20|i[o].length-1})),y+=v[d].length;return}}i[o].push(u+l<<12|m-l<<2|a),e.push(new lunr.Token(g.slice(l,m).toLowerCase(),{position:o<<20|i[o].length-1}))})}})}return e}function ae(t,e=r=>r){return t.trim().split(/"([^"]+)"/g).map((r,n)=>n&1?r.replace(/^\b|^(?![^\x00-\x7F]|$)|\s+/g," +"):r).join("").replace(/"|(?:^|\s+)[*+\-:^~]+(?=\s+|$)/g,"").split(/\s+/g).reduce((r,n)=>{let i=e(n);return[...r,...Array.isArray(i)?i:[i]]},[]).map(r=>/([~^]$)/.test(r)?`${r}1`:r).map(r=>/(^[+-]|[~^]\d+$)/.test(r)?r:`${r}*`).join(" ")}function ue(t){return ae(t,e=>{let r=[],n=new lunr.QueryLexer(e);n.run();for(let{type:i,str:s,start:o,end:a}of n.lexemes)switch(i){case"FIELD":["title","text","tags"].includes(s)||(e=[e.slice(0,a)," ",e.slice(a+1)].join(""));break;case"TERM":W(s,lunr.tokenizer.separator,(...u)=>{r.push([e.slice(0,o),s.slice(...u),e.slice(a)].join(""))})}return r})}function ce(t){let e=new lunr.Query(["title","text","tags"]);new lunr.QueryParser(t,e).parse();for(let n of e.clauses)n.usePipeline=!0,n.term.startsWith("*")&&(n.wildcard=lunr.Query.wildcard.LEADING,n.term=n.term.slice(1)),n.term.endsWith("*")&&(n.wildcard=lunr.Query.wildcard.TRAILING,n.term=n.term.slice(0,-1));return e.clauses}function le(t,e){var i;let r=new Set(t),n={};for(let s=0;s<e.length;s++)for(let o of r)e[s].startsWith(o.term)&&(n[o.term]=!0,r.delete(o));for(let s of r)(i=lunr.stopWordFilter)!=null&&i.call(lunr,s.term)&&(n[s.term]=!1);return n}function he(t,e){let r=new Set,n=new Uint16Array(t.length);for(let s=0;s<t.length;s++)for(let o=s+1;o<t.length;o++)t.slice(s,o)in e&&(n[s]=o-s);let i=[0];for(let s=i.length;s>0;){let o=i[--s];for(let u=1;u<n[o];u++)n[o+u]>n[o]-u&&(r.add(t.slice(o,o+u)),i[s++]=o+u);let a=o+n[o];n[a]&&a<t.length-1&&(i[s++]=a),r.add(t.slice(o,a))}return r.has("")?new Set([t]):r}function Oe(t){return e=>r=>{if(typeof r[e]=="undefined")return;let 
n=[r.location,e].join(":");return t.set(n,lunr.tokenizer.table=[]),r[e]}}function Re(t,e){let[r,n]=[new Set(t),new Set(e)];return[...new Set([...r].filter(i=>!n.has(i)))]}var H=class{constructor({config:e,docs:r,options:n}){let i=Oe(this.table=new Map);this.map=ne(r),this.options=n,this.index=lunr(function(){this.metadataWhitelist=["position"],this.b(0),e.lang.length===1&&e.lang[0]!=="en"?this.use(lunr[e.lang[0]]):e.lang.length>1&&this.use(lunr.multiLanguage(...e.lang)),this.tokenizer=oe,lunr.tokenizer.separator=new RegExp(e.separator),lunr.segmenter="TinySegmenter"in lunr?new lunr.TinySegmenter:void 0;let s=Re(["trimmer","stopWordFilter","stemmer"],e.pipeline);for(let o of e.lang.map(a=>a==="en"?lunr:lunr[a]))for(let a of s)this.pipeline.remove(o[a]),this.searchPipeline.remove(o[a]);this.ref("location"),this.field("title",{boost:1e3,extractor:i("title")}),this.field("text",{boost:1,extractor:i("text")}),this.field("tags",{boost:1e6,extractor:i("tags")});for(let o of r)this.add(o,{boost:o.boost})})}search(e){if(e=e.replace(new RegExp("\\p{sc=Han}+","gu"),s=>[...he(s,this.index.invertedIndex)].join("* ")),e=ue(e),!e)return{items:[]};let r=ce(e).filter(s=>s.presence!==lunr.Query.presence.PROHIBITED),n=this.index.search(e).reduce((s,{ref:o,score:a,matchData:u})=>{let c=this.map.get(o);if(typeof c!="undefined"){c=A({},c),c.tags&&(c.tags=[...c.tags]);let f=le(r,Object.keys(u.metadata));for(let l of this.index.fields){if(typeof c[l]=="undefined")continue;let m=[];for(let d of Object.values(u.metadata))typeof d[l]!="undefined"&&m.push(...d[l].position);if(!m.length)continue;let x=this.table.get([c.location,l].join(":")),v=Array.isArray(c[l])?q:se;c[l]=v(c[l],x,m,l!=="text")}let g=+!c.parent+Object.values(f).filter(l=>l).length/Object.keys(f).length;s.push(G(A({},c),{score:a*(1+Z(g,2)),terms:f}))}return s},[]).sort((s,o)=>o.score-s.score).reduce((s,o)=>{let a=this.map.get(o.location);if(typeof a!="undefined"){let 
u=a.parent?a.parent.location:a.location;s.set(u,[...s.get(u)||[],o])}return s},new Map);for(let[s,o]of n)if(!o.find(a=>a.location===s)){let a=this.map.get(s);o.push(G(A({},a),{score:0,terms:{}}))}let i;if(this.options.suggest){let s=this.index.query(o=>{for(let a of r)o.term(a.term,{fields:["title"],presence:lunr.Query.presence.REQUIRED,wildcard:lunr.Query.wildcard.TRAILING})});i=s.length?Object.keys(s[0].matchData.metadata):[]}return A({items:[...n.values()]},typeof i!="undefined"&&{suggest:i})}};var fe;function Ie(t){return B(this,null,function*(){let e="../lunr";if(typeof parent!="undefined"&&"IFrameWorker"in parent){let n=re("script[src]"),[i]=n.src.split("/worker");e=e.replace("..",i)}let r=[];for(let n of t.lang){switch(n){case"ja":r.push(`${e}/tinyseg.js`);break;case"hi":case"th":r.push(`${e}/wordcut.js`);break}n!=="en"&&r.push(`${e}/min/lunr.${n}.min.js`)}t.lang.length>1&&r.push(`${e}/min/lunr.multi.min.js`),r.length&&(yield importScripts(`${e}/min/lunr.stemmer.support.min.js`,...r))})}function Fe(t){return B(this,null,function*(){switch(t.type){case 0:return yield Ie(t.data.config),fe=new H(t.data),{type:1};case 2:let e=t.data;try{return{type:3,data:fe.search(e)}}catch(r){return console.warn(`Invalid query: ${e} \u2013 see https://bit.ly/2s3ChXG`),console.warn(r),{type:3,data:{items:[]}}}default:throw new TypeError("Invalid message type")}})}self.lunr=de.default;addEventListener("message",t=>B(void 0,null,function*(){postMessage(yield Fe(t.data))}));})();
+//# sourceMappingURL=search.b8dbb3d2.min.js.map
+
diff --git a/assets/javascripts/workers/search.b8dbb3d2.min.js.map b/assets/javascripts/workers/search.b8dbb3d2.min.js.map
new file mode 100644
index 0000000..7ef0846
--- /dev/null
+++ b/assets/javascripts/workers/search.b8dbb3d2.min.js.map
@@ -0,0 +1,7 @@
+{
+  "version": 3,
+  "sources": ["node_modules/lunr/lunr.js", "src/templates/assets/javascripts/integrations/search/worker/main/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", "src/templates/assets/javascripts/polyfills/index.ts", "src/templates/assets/javascripts/integrations/search/config/index.ts", "src/templates/assets/javascripts/integrations/search/internal/_/index.ts", "src/templates/assets/javascripts/integrations/search/internal/extract/index.ts", "src/templates/assets/javascripts/integrations/search/internal/highlight/index.ts", "src/templates/assets/javascripts/integrations/search/internal/tokenize/index.ts", "src/templates/assets/javascripts/integrations/search/query/transform/index.ts", "src/templates/assets/javascripts/integrations/search/query/_/index.ts", "src/templates/assets/javascripts/integrations/search/query/segment/index.ts", "src/templates/assets/javascripts/integrations/search/_/index.ts"],
+  "sourcesContent": ["/**\n * lunr - http://lunrjs.com - A bit like Solr, but much smaller and not as bright - 2.3.9\n * Copyright (C) 2020 Oliver Nightingale\n * @license MIT\n */\n\n;(function(){\n\n/**\n * A convenience function for configuring and constructing\n * a new lunr Index.\n *\n * A lunr.Builder instance is created and the pipeline setup\n * with a trimmer, stop word filter and stemmer.\n *\n * This builder object is yielded to the configuration function\n * that is passed as a parameter, allowing the list of fields\n * and other builder parameters to be customised.\n *\n * All documents _must_ be added within the passed config function.\n *\n * @example\n * var idx = lunr(function () {\n *   this.field('title')\n *   this.field('body')\n *   this.ref('id')\n *\n *   documents.forEach(function (doc) {\n *     this.add(doc)\n *   }, this)\n * })\n *\n * @see {@link lunr.Builder}\n * @see {@link lunr.Pipeline}\n * @see {@link lunr.trimmer}\n * @see {@link lunr.stopWordFilter}\n * @see {@link lunr.stemmer}\n * @namespace {function} lunr\n */\nvar lunr = function (config) {\n  var builder = new lunr.Builder\n\n  builder.pipeline.add(\n    lunr.trimmer,\n    lunr.stopWordFilter,\n    lunr.stemmer\n  )\n\n  builder.searchPipeline.add(\n    lunr.stemmer\n  )\n\n  config.call(builder, builder)\n  return builder.build()\n}\n\nlunr.version = \"2.3.9\"\n/*!\n * lunr.utils\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * A namespace containing utils for the rest of the lunr library\n * @namespace lunr.utils\n */\nlunr.utils = {}\n\n/**\n * Print a warning message to the console.\n *\n * @param {String} message The message to be printed.\n * @memberOf lunr.utils\n * @function\n */\nlunr.utils.warn = (function (global) {\n  /* eslint-disable no-console */\n  return function (message) {\n    if (global.console && console.warn) {\n      console.warn(message)\n    }\n  }\n  /* eslint-enable no-console */\n})(this)\n\n/**\n * Convert an object to a string.\n 
*\n * In the case of `null` and `undefined` the function returns\n * the empty string, in all other cases the result of calling\n * `toString` on the passed object is returned.\n *\n * @param {Any} obj The object to convert to a string.\n * @return {String} string representation of the passed object.\n * @memberOf lunr.utils\n */\nlunr.utils.asString = function (obj) {\n  if (obj === void 0 || obj === null) {\n    return \"\"\n  } else {\n    return obj.toString()\n  }\n}\n\n/**\n * Clones an object.\n *\n * Will create a copy of an existing object such that any mutations\n * on the copy cannot affect the original.\n *\n * Only shallow objects are supported, passing a nested object to this\n * function will cause a TypeError.\n *\n * Objects with primitives, and arrays of primitives are supported.\n *\n * @param {Object} obj The object to clone.\n * @return {Object} a clone of the passed object.\n * @throws {TypeError} when a nested object is passed.\n * @memberOf Utils\n */\nlunr.utils.clone = function (obj) {\n  if (obj === null || obj === undefined) {\n    return obj\n  }\n\n  var clone = Object.create(null),\n      keys = Object.keys(obj)\n\n  for (var i = 0; i < keys.length; i++) {\n    var key = keys[i],\n        val = obj[key]\n\n    if (Array.isArray(val)) {\n      clone[key] = val.slice()\n      continue\n    }\n\n    if (typeof val === 'string' ||\n        typeof val === 'number' ||\n        typeof val === 'boolean') {\n      clone[key] = val\n      continue\n    }\n\n    throw new TypeError(\"clone is not deep and does not support nested objects\")\n  }\n\n  return clone\n}\nlunr.FieldRef = function (docRef, fieldName, stringValue) {\n  this.docRef = docRef\n  this.fieldName = fieldName\n  this._stringValue = stringValue\n}\n\nlunr.FieldRef.joiner = \"/\"\n\nlunr.FieldRef.fromString = function (s) {\n  var n = s.indexOf(lunr.FieldRef.joiner)\n\n  if (n === -1) {\n    throw \"malformed field ref string\"\n  }\n\n  var fieldRef = s.slice(0, n),\n      
docRef = s.slice(n + 1)\n\n  return new lunr.FieldRef (docRef, fieldRef, s)\n}\n\nlunr.FieldRef.prototype.toString = function () {\n  if (this._stringValue == undefined) {\n    this._stringValue = this.fieldName + lunr.FieldRef.joiner + this.docRef\n  }\n\n  return this._stringValue\n}\n/*!\n * lunr.Set\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * A lunr set.\n *\n * @constructor\n */\nlunr.Set = function (elements) {\n  this.elements = Object.create(null)\n\n  if (elements) {\n    this.length = elements.length\n\n    for (var i = 0; i < this.length; i++) {\n      this.elements[elements[i]] = true\n    }\n  } else {\n    this.length = 0\n  }\n}\n\n/**\n * A complete set that contains all elements.\n *\n * @static\n * @readonly\n * @type {lunr.Set}\n */\nlunr.Set.complete = {\n  intersect: function (other) {\n    return other\n  },\n\n  union: function () {\n    return this\n  },\n\n  contains: function () {\n    return true\n  }\n}\n\n/**\n * An empty set that contains no elements.\n *\n * @static\n * @readonly\n * @type {lunr.Set}\n */\nlunr.Set.empty = {\n  intersect: function () {\n    return this\n  },\n\n  union: function (other) {\n    return other\n  },\n\n  contains: function () {\n    return false\n  }\n}\n\n/**\n * Returns true if this set contains the specified object.\n *\n * @param {object} object - Object whose presence in this set is to be tested.\n * @returns {boolean} - True if this set contains the specified object.\n */\nlunr.Set.prototype.contains = function (object) {\n  return !!this.elements[object]\n}\n\n/**\n * Returns a new set containing only the elements that are present in both\n * this set and the specified set.\n *\n * @param {lunr.Set} other - set to intersect with this set.\n * @returns {lunr.Set} a new set that is the intersection of this and the specified set.\n */\n\nlunr.Set.prototype.intersect = function (other) {\n  var a, b, elements, intersection = []\n\n  if (other === lunr.Set.complete) {\n    return this\n  
}\n\n  if (other === lunr.Set.empty) {\n    return other\n  }\n\n  if (this.length < other.length) {\n    a = this\n    b = other\n  } else {\n    a = other\n    b = this\n  }\n\n  elements = Object.keys(a.elements)\n\n  for (var i = 0; i < elements.length; i++) {\n    var element = elements[i]\n    if (element in b.elements) {\n      intersection.push(element)\n    }\n  }\n\n  return new lunr.Set (intersection)\n}\n\n/**\n * Returns a new set combining the elements of this and the specified set.\n *\n * @param {lunr.Set} other - set to union with this set.\n * @return {lunr.Set} a new set that is the union of this and the specified set.\n */\n\nlunr.Set.prototype.union = function (other) {\n  if (other === lunr.Set.complete) {\n    return lunr.Set.complete\n  }\n\n  if (other === lunr.Set.empty) {\n    return this\n  }\n\n  return new lunr.Set(Object.keys(this.elements).concat(Object.keys(other.elements)))\n}\n/**\n * A function to calculate the inverse document frequency for\n * a posting. 
This is shared between the builder and the index\n *\n * @private\n * @param {object} posting - The posting for a given term\n * @param {number} documentCount - The total number of documents.\n */\nlunr.idf = function (posting, documentCount) {\n  var documentsWithTerm = 0\n\n  for (var fieldName in posting) {\n    if (fieldName == '_index') continue // Ignore the term index, its not a field\n    documentsWithTerm += Object.keys(posting[fieldName]).length\n  }\n\n  var x = (documentCount - documentsWithTerm + 0.5) / (documentsWithTerm + 0.5)\n\n  return Math.log(1 + Math.abs(x))\n}\n\n/**\n * A token wraps a string representation of a token\n * as it is passed through the text processing pipeline.\n *\n * @constructor\n * @param {string} [str=''] - The string token being wrapped.\n * @param {object} [metadata={}] - Metadata associated with this token.\n */\nlunr.Token = function (str, metadata) {\n  this.str = str || \"\"\n  this.metadata = metadata || {}\n}\n\n/**\n * Returns the token string that is being wrapped by this object.\n *\n * @returns {string}\n */\nlunr.Token.prototype.toString = function () {\n  return this.str\n}\n\n/**\n * A token update function is used when updating or optionally\n * when cloning a token.\n *\n * @callback lunr.Token~updateFunction\n * @param {string} str - The string representation of the token.\n * @param {Object} metadata - All metadata associated with this token.\n */\n\n/**\n * Applies the given function to the wrapped string token.\n *\n * @example\n * token.update(function (str, metadata) {\n *   return str.toUpperCase()\n * })\n *\n * @param {lunr.Token~updateFunction} fn - A function to apply to the token string.\n * @returns {lunr.Token}\n */\nlunr.Token.prototype.update = function (fn) {\n  this.str = fn(this.str, this.metadata)\n  return this\n}\n\n/**\n * Creates a clone of this token. 
Optionally a function can be\n * applied to the cloned token.\n *\n * @param {lunr.Token~updateFunction} [fn] - An optional function to apply to the cloned token.\n * @returns {lunr.Token}\n */\nlunr.Token.prototype.clone = function (fn) {\n  fn = fn || function (s) { return s }\n  return new lunr.Token (fn(this.str, this.metadata), this.metadata)\n}\n/*!\n * lunr.tokenizer\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * A function for splitting a string into tokens ready to be inserted into\n * the search index. Uses `lunr.tokenizer.separator` to split strings, change\n * the value of this property to change how strings are split into tokens.\n *\n * This tokenizer will convert its parameter to a string by calling `toString` and\n * then will split this string on the character in `lunr.tokenizer.separator`.\n * Arrays will have their elements converted to strings and wrapped in a lunr.Token.\n *\n * Optional metadata can be passed to the tokenizer, this metadata will be cloned and\n * added as metadata to every token that is created from the object to be tokenized.\n *\n * @static\n * @param {?(string|object|object[])} obj - The object to convert into tokens\n * @param {?object} metadata - Optional metadata to associate with every token\n * @returns {lunr.Token[]}\n * @see {@link lunr.Pipeline}\n */\nlunr.tokenizer = function (obj, metadata) {\n  if (obj == null || obj == undefined) {\n    return []\n  }\n\n  if (Array.isArray(obj)) {\n    return obj.map(function (t) {\n      return new lunr.Token(\n        lunr.utils.asString(t).toLowerCase(),\n        lunr.utils.clone(metadata)\n      )\n    })\n  }\n\n  var str = obj.toString().toLowerCase(),\n      len = str.length,\n      tokens = []\n\n  for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {\n    var char = str.charAt(sliceEnd),\n        sliceLength = sliceEnd - sliceStart\n\n    if ((char.match(lunr.tokenizer.separator) || sliceEnd == len)) {\n\n      if (sliceLength > 0) {\n        
var tokenMetadata = lunr.utils.clone(metadata) || {}\n        tokenMetadata[\"position\"] = [sliceStart, sliceLength]\n        tokenMetadata[\"index\"] = tokens.length\n\n        tokens.push(\n          new lunr.Token (\n            str.slice(sliceStart, sliceEnd),\n            tokenMetadata\n          )\n        )\n      }\n\n      sliceStart = sliceEnd + 1\n    }\n\n  }\n\n  return tokens\n}\n\n/**\n * The separator used to split a string into tokens. Override this property to change the behaviour of\n * `lunr.tokenizer` behaviour when tokenizing strings. By default this splits on whitespace and hyphens.\n *\n * @static\n * @see lunr.tokenizer\n */\nlunr.tokenizer.separator = /[\\s\\-]+/\n/*!\n * lunr.Pipeline\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * lunr.Pipelines maintain an ordered list of functions to be applied to all\n * tokens in documents entering the search index and queries being ran against\n * the index.\n *\n * An instance of lunr.Index created with the lunr shortcut will contain a\n * pipeline with a stop word filter and an English language stemmer. Extra\n * functions can be added before or after either of these functions or these\n * default functions can be removed.\n *\n * When run the pipeline will call each function in turn, passing a token, the\n * index of that token in the original list of all tokens and finally a list of\n * all the original tokens.\n *\n * The output of functions in the pipeline will be passed to the next function\n * in the pipeline. To exclude a token from entering the index the function\n * should return undefined, the rest of the pipeline will not be called with\n * this token.\n *\n * For serialisation of pipelines to work, all functions used in an instance of\n * a pipeline should be registered with lunr.Pipeline. Registered functions can\n * then be loaded. 
If trying to load a serialised pipeline that uses functions\n * that are not registered an error will be thrown.\n *\n * If not planning on serialising the pipeline then registering pipeline functions\n * is not necessary.\n *\n * @constructor\n */\nlunr.Pipeline = function () {\n  this._stack = []\n}\n\nlunr.Pipeline.registeredFunctions = Object.create(null)\n\n/**\n * A pipeline function maps lunr.Token to lunr.Token. A lunr.Token contains the token\n * string as well as all known metadata. A pipeline function can mutate the token string\n * or mutate (or add) metadata for a given token.\n *\n * A pipeline function can indicate that the passed token should be discarded by returning\n * null, undefined or an empty string. This token will not be passed to any downstream pipeline\n * functions and will not be added to the index.\n *\n * Multiple tokens can be returned by returning an array of tokens. Each token will be passed\n * to any downstream pipeline functions and all will returned tokens will be added to the index.\n *\n * Any number of pipeline functions may be chained together using a lunr.Pipeline.\n *\n * @interface lunr.PipelineFunction\n * @param {lunr.Token} token - A token from the document being processed.\n * @param {number} i - The index of this token in the complete list of tokens for this document/field.\n * @param {lunr.Token[]} tokens - All tokens for this document/field.\n * @returns {(?lunr.Token|lunr.Token[])}\n */\n\n/**\n * Register a function with the pipeline.\n *\n * Functions that are used in the pipeline should be registered if the pipeline\n * needs to be serialised, or a serialised pipeline needs to be loaded.\n *\n * Registering a function does not add it to a pipeline, functions must still be\n * added to instances of the pipeline for them to be used when running a pipeline.\n *\n * @param {lunr.PipelineFunction} fn - The function to check for.\n * @param {String} label - The label to register this function with\n 
*/\nlunr.Pipeline.registerFunction = function (fn, label) {\n  if (label in this.registeredFunctions) {\n    lunr.utils.warn('Overwriting existing registered function: ' + label)\n  }\n\n  fn.label = label\n  lunr.Pipeline.registeredFunctions[fn.label] = fn\n}\n\n/**\n * Warns if the function is not registered as a Pipeline function.\n *\n * @param {lunr.PipelineFunction} fn - The function to check for.\n * @private\n */\nlunr.Pipeline.warnIfFunctionNotRegistered = function (fn) {\n  var isRegistered = fn.label && (fn.label in this.registeredFunctions)\n\n  if (!isRegistered) {\n    lunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\\n', fn)\n  }\n}\n\n/**\n * Loads a previously serialised pipeline.\n *\n * All functions to be loaded must already be registered with lunr.Pipeline.\n * If any function from the serialised data has not been registered then an\n * error will be thrown.\n *\n * @param {Object} serialised - The serialised pipeline to load.\n * @returns {lunr.Pipeline}\n */\nlunr.Pipeline.load = function (serialised) {\n  var pipeline = new lunr.Pipeline\n\n  serialised.forEach(function (fnName) {\n    var fn = lunr.Pipeline.registeredFunctions[fnName]\n\n    if (fn) {\n      pipeline.add(fn)\n    } else {\n      throw new Error('Cannot load unregistered function: ' + fnName)\n    }\n  })\n\n  return pipeline\n}\n\n/**\n * Adds new functions to the end of the pipeline.\n *\n * Logs a warning if the function has not been registered.\n *\n * @param {lunr.PipelineFunction[]} functions - Any number of functions to add to the pipeline.\n */\nlunr.Pipeline.prototype.add = function () {\n  var fns = Array.prototype.slice.call(arguments)\n\n  fns.forEach(function (fn) {\n    lunr.Pipeline.warnIfFunctionNotRegistered(fn)\n    this._stack.push(fn)\n  }, this)\n}\n\n/**\n * Adds a single function after a function that already exists in the\n * pipeline.\n *\n * Logs a warning if the function has not been 
registered.\n *\n * @param {lunr.PipelineFunction} existingFn - A function that already exists in the pipeline.\n * @param {lunr.PipelineFunction} newFn - The new function to add to the pipeline.\n */\nlunr.Pipeline.prototype.after = function (existingFn, newFn) {\n  lunr.Pipeline.warnIfFunctionNotRegistered(newFn)\n\n  var pos = this._stack.indexOf(existingFn)\n  if (pos == -1) {\n    throw new Error('Cannot find existingFn')\n  }\n\n  pos = pos + 1\n  this._stack.splice(pos, 0, newFn)\n}\n\n/**\n * Adds a single function before a function that already exists in the\n * pipeline.\n *\n * Logs a warning if the function has not been registered.\n *\n * @param {lunr.PipelineFunction} existingFn - A function that already exists in the pipeline.\n * @param {lunr.PipelineFunction} newFn - The new function to add to the pipeline.\n */\nlunr.Pipeline.prototype.before = function (existingFn, newFn) {\n  lunr.Pipeline.warnIfFunctionNotRegistered(newFn)\n\n  var pos = this._stack.indexOf(existingFn)\n  if (pos == -1) {\n    throw new Error('Cannot find existingFn')\n  }\n\n  this._stack.splice(pos, 0, newFn)\n}\n\n/**\n * Removes a function from the pipeline.\n *\n * @param {lunr.PipelineFunction} fn The function to remove from the pipeline.\n */\nlunr.Pipeline.prototype.remove = function (fn) {\n  var pos = this._stack.indexOf(fn)\n  if (pos == -1) {\n    return\n  }\n\n  this._stack.splice(pos, 1)\n}\n\n/**\n * Runs the current list of functions that make up the pipeline against the\n * passed tokens.\n *\n * @param {Array} tokens The tokens to run through the pipeline.\n * @returns {Array}\n */\nlunr.Pipeline.prototype.run = function (tokens) {\n  var stackLength = this._stack.length\n\n  for (var i = 0; i < stackLength; i++) {\n    var fn = this._stack[i]\n    var memo = []\n\n    for (var j = 0; j < tokens.length; j++) {\n      var result = fn(tokens[j], j, tokens)\n\n      if (result === null || result === void 0 || result === '') continue\n\n      if 
(Array.isArray(result)) {\n        for (var k = 0; k < result.length; k++) {\n          memo.push(result[k])\n        }\n      } else {\n        memo.push(result)\n      }\n    }\n\n    tokens = memo\n  }\n\n  return tokens\n}\n\n/**\n * Convenience method for passing a string through a pipeline and getting\n * strings out. This method takes care of wrapping the passed string in a\n * token and mapping the resulting tokens back to strings.\n *\n * @param {string} str - The string to pass through the pipeline.\n * @param {?object} metadata - Optional metadata to associate with the token\n * passed to the pipeline.\n * @returns {string[]}\n */\nlunr.Pipeline.prototype.runString = function (str, metadata) {\n  var token = new lunr.Token (str, metadata)\n\n  return this.run([token]).map(function (t) {\n    return t.toString()\n  })\n}\n\n/**\n * Resets the pipeline by removing any existing processors.\n *\n */\nlunr.Pipeline.prototype.reset = function () {\n  this._stack = []\n}\n\n/**\n * Returns a representation of the pipeline ready for serialisation.\n *\n * Logs a warning if the function has not been registered.\n *\n * @returns {Array}\n */\nlunr.Pipeline.prototype.toJSON = function () {\n  return this._stack.map(function (fn) {\n    lunr.Pipeline.warnIfFunctionNotRegistered(fn)\n\n    return fn.label\n  })\n}\n/*!\n * lunr.Vector\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * A vector is used to construct the vector space of documents and queries. These\n * vectors support operations to determine the similarity between two documents or\n * a document and a query.\n *\n * Normally no parameters are required for initializing a vector, but in the case of\n * loading a previously dumped vector the raw elements can be provided to the constructor.\n *\n * For performance reasons vectors are implemented with a flat array, where an elements\n * index is immediately followed by its value. E.g. [index, value, index, value]. 
This\n * allows the underlying array to be as sparse as possible and still offer decent\n * performance when being used for vector calculations.\n *\n * @constructor\n * @param {Number[]} [elements] - The flat list of element index and element value pairs.\n */\nlunr.Vector = function (elements) {\n  this._magnitude = 0\n  this.elements = elements || []\n}\n\n\n/**\n * Calculates the position within the vector to insert a given index.\n *\n * This is used internally by insert and upsert. If there are duplicate indexes then\n * the position is returned as if the value for that index were to be updated, but it\n * is the callers responsibility to check whether there is a duplicate at that index\n *\n * @param {Number} insertIdx - The index at which the element should be inserted.\n * @returns {Number}\n */\nlunr.Vector.prototype.positionForIndex = function (index) {\n  // For an empty vector the tuple can be inserted at the beginning\n  if (this.elements.length == 0) {\n    return 0\n  }\n\n  var start = 0,\n      end = this.elements.length / 2,\n      sliceLength = end - start,\n      pivotPoint = Math.floor(sliceLength / 2),\n      pivotIndex = this.elements[pivotPoint * 2]\n\n  while (sliceLength > 1) {\n    if (pivotIndex < index) {\n      start = pivotPoint\n    }\n\n    if (pivotIndex > index) {\n      end = pivotPoint\n    }\n\n    if (pivotIndex == index) {\n      break\n    }\n\n    sliceLength = end - start\n    pivotPoint = start + Math.floor(sliceLength / 2)\n    pivotIndex = this.elements[pivotPoint * 2]\n  }\n\n  if (pivotIndex == index) {\n    return pivotPoint * 2\n  }\n\n  if (pivotIndex > index) {\n    return pivotPoint * 2\n  }\n\n  if (pivotIndex < index) {\n    return (pivotPoint + 1) * 2\n  }\n}\n\n/**\n * Inserts an element at an index within the vector.\n *\n * Does not allow duplicates, will throw an error if there is already an entry\n * for this index.\n *\n * @param {Number} insertIdx - The index at which the element should be inserted.\n 
* @param {Number} val - The value to be inserted into the vector.\n */\nlunr.Vector.prototype.insert = function (insertIdx, val) {\n  this.upsert(insertIdx, val, function () {\n    throw \"duplicate index\"\n  })\n}\n\n/**\n * Inserts or updates an existing index within the vector.\n *\n * @param {Number} insertIdx - The index at which the element should be inserted.\n * @param {Number} val - The value to be inserted into the vector.\n * @param {function} fn - A function that is called for updates, the existing value and the\n * requested value are passed as arguments\n */\nlunr.Vector.prototype.upsert = function (insertIdx, val, fn) {\n  this._magnitude = 0\n  var position = this.positionForIndex(insertIdx)\n\n  if (this.elements[position] == insertIdx) {\n    this.elements[position + 1] = fn(this.elements[position + 1], val)\n  } else {\n    this.elements.splice(position, 0, insertIdx, val)\n  }\n}\n\n/**\n * Calculates the magnitude of this vector.\n *\n * @returns {Number}\n */\nlunr.Vector.prototype.magnitude = function () {\n  if (this._magnitude) return this._magnitude\n\n  var sumOfSquares = 0,\n      elementsLength = this.elements.length\n\n  for (var i = 1; i < elementsLength; i += 2) {\n    var val = this.elements[i]\n    sumOfSquares += val * val\n  }\n\n  return this._magnitude = Math.sqrt(sumOfSquares)\n}\n\n/**\n * Calculates the dot product of this vector and another vector.\n *\n * @param {lunr.Vector} otherVector - The vector to compute the dot product with.\n * @returns {Number}\n */\nlunr.Vector.prototype.dot = function (otherVector) {\n  var dotProduct = 0,\n      a = this.elements, b = otherVector.elements,\n      aLen = a.length, bLen = b.length,\n      aVal = 0, bVal = 0,\n      i = 0, j = 0\n\n  while (i < aLen && j < bLen) {\n    aVal = a[i], bVal = b[j]\n    if (aVal < bVal) {\n      i += 2\n    } else if (aVal > bVal) {\n      j += 2\n    } else if (aVal == bVal) {\n      dotProduct += a[i + 1] * b[j + 1]\n      i += 2\n      j += 2\n    
}\n  }\n\n  return dotProduct\n}\n\n/**\n * Calculates the similarity between this vector and another vector.\n *\n * @param {lunr.Vector} otherVector - The other vector to calculate the\n * similarity with.\n * @returns {Number}\n */\nlunr.Vector.prototype.similarity = function (otherVector) {\n  return this.dot(otherVector) / this.magnitude() || 0\n}\n\n/**\n * Converts the vector to an array of the elements within the vector.\n *\n * @returns {Number[]}\n */\nlunr.Vector.prototype.toArray = function () {\n  var output = new Array (this.elements.length / 2)\n\n  for (var i = 1, j = 0; i < this.elements.length; i += 2, j++) {\n    output[j] = this.elements[i]\n  }\n\n  return output\n}\n\n/**\n * A JSON serializable representation of the vector.\n *\n * @returns {Number[]}\n */\nlunr.Vector.prototype.toJSON = function () {\n  return this.elements\n}\n/* eslint-disable */\n/*!\n * lunr.stemmer\n * Copyright (C) 2020 Oliver Nightingale\n * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt\n */\n\n/**\n * lunr.stemmer is an english language stemmer, this is a JavaScript\n * implementation of the PorterStemmer taken from http://tartarus.org/~martin\n *\n * @static\n * @implements {lunr.PipelineFunction}\n * @param {lunr.Token} token - The string to stem\n * @returns {lunr.Token}\n * @see {@link lunr.Pipeline}\n * @function\n */\nlunr.stemmer = (function(){\n  var step2list = {\n      \"ational\" : \"ate\",\n      \"tional\" : \"tion\",\n      \"enci\" : \"ence\",\n      \"anci\" : \"ance\",\n      \"izer\" : \"ize\",\n      \"bli\" : \"ble\",\n      \"alli\" : \"al\",\n      \"entli\" : \"ent\",\n      \"eli\" : \"e\",\n      \"ousli\" : \"ous\",\n      \"ization\" : \"ize\",\n      \"ation\" : \"ate\",\n      \"ator\" : \"ate\",\n      \"alism\" : \"al\",\n      \"iveness\" : \"ive\",\n      \"fulness\" : \"ful\",\n      \"ousness\" : \"ous\",\n      \"aliti\" : \"al\",\n      \"iviti\" : \"ive\",\n      \"biliti\" : \"ble\",\n      \"logi\" : 
\"log\"\n    },\n\n    step3list = {\n      \"icate\" : \"ic\",\n      \"ative\" : \"\",\n      \"alize\" : \"al\",\n      \"iciti\" : \"ic\",\n      \"ical\" : \"ic\",\n      \"ful\" : \"\",\n      \"ness\" : \"\"\n    },\n\n    c = \"[^aeiou]\",          // consonant\n    v = \"[aeiouy]\",          // vowel\n    C = c + \"[^aeiouy]*\",    // consonant sequence\n    V = v + \"[aeiou]*\",      // vowel sequence\n\n    mgr0 = \"^(\" + C + \")?\" + V + C,               // [C]VC... is m>0\n    meq1 = \"^(\" + C + \")?\" + V + C + \"(\" + V + \")?$\",  // [C]VC[V] is m=1\n    mgr1 = \"^(\" + C + \")?\" + V + C + V + C,       // [C]VCVC... is m>1\n    s_v = \"^(\" + C + \")?\" + v;                   // vowel in stem\n\n  var re_mgr0 = new RegExp(mgr0);\n  var re_mgr1 = new RegExp(mgr1);\n  var re_meq1 = new RegExp(meq1);\n  var re_s_v = new RegExp(s_v);\n\n  var re_1a = /^(.+?)(ss|i)es$/;\n  var re2_1a = /^(.+?)([^s])s$/;\n  var re_1b = /^(.+?)eed$/;\n  var re2_1b = /^(.+?)(ed|ing)$/;\n  var re_1b_2 = /.$/;\n  var re2_1b_2 = /(at|bl|iz)$/;\n  var re3_1b_2 = new RegExp(\"([^aeiouylsz])\\\\1$\");\n  var re4_1b_2 = new RegExp(\"^\" + C + v + \"[^aeiouwxy]$\");\n\n  var re_1c = /^(.+?[^aeiou])y$/;\n  var re_2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;\n\n  var re_3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;\n\n  var re_4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;\n  var re2_4 = /^(.+?)(s|t)(ion)$/;\n\n  var re_5 = /^(.+?)e$/;\n  var re_5_1 = /ll$/;\n  var re3_5 = new RegExp(\"^\" + C + v + \"[^aeiouwxy]$\");\n\n  var porterStemmer = function porterStemmer(w) {\n    var stem,\n      suffix,\n      firstch,\n      re,\n      re2,\n      re3,\n      re4;\n\n    if (w.length < 3) { return w; }\n\n    firstch = w.substr(0,1);\n    if (firstch == \"y\") {\n      w = firstch.toUpperCase() + w.substr(1);\n    }\n\n    // Step 1a\n 
   re = re_1a\n    re2 = re2_1a;\n\n    if (re.test(w)) { w = w.replace(re,\"$1$2\"); }\n    else if (re2.test(w)) { w = w.replace(re2,\"$1$2\"); }\n\n    // Step 1b\n    re = re_1b;\n    re2 = re2_1b;\n    if (re.test(w)) {\n      var fp = re.exec(w);\n      re = re_mgr0;\n      if (re.test(fp[1])) {\n        re = re_1b_2;\n        w = w.replace(re,\"\");\n      }\n    } else if (re2.test(w)) {\n      var fp = re2.exec(w);\n      stem = fp[1];\n      re2 = re_s_v;\n      if (re2.test(stem)) {\n        w = stem;\n        re2 = re2_1b_2;\n        re3 = re3_1b_2;\n        re4 = re4_1b_2;\n        if (re2.test(w)) { w = w + \"e\"; }\n        else if (re3.test(w)) { re = re_1b_2; w = w.replace(re,\"\"); }\n        else if (re4.test(w)) { w = w + \"e\"; }\n      }\n    }\n\n    // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first letter of the word (so cry -> cri, by -> by, say -> say)\n    re = re_1c;\n    if (re.test(w)) {\n      var fp = re.exec(w);\n      stem = fp[1];\n      w = stem + \"i\";\n    }\n\n    // Step 2\n    re = re_2;\n    if (re.test(w)) {\n      var fp = re.exec(w);\n      stem = fp[1];\n      suffix = fp[2];\n      re = re_mgr0;\n      if (re.test(stem)) {\n        w = stem + step2list[suffix];\n      }\n    }\n\n    // Step 3\n    re = re_3;\n    if (re.test(w)) {\n      var fp = re.exec(w);\n      stem = fp[1];\n      suffix = fp[2];\n      re = re_mgr0;\n      if (re.test(stem)) {\n        w = stem + step3list[suffix];\n      }\n    }\n\n    // Step 4\n    re = re_4;\n    re2 = re2_4;\n    if (re.test(w)) {\n      var fp = re.exec(w);\n      stem = fp[1];\n      re = re_mgr1;\n      if (re.test(stem)) {\n        w = stem;\n      }\n    } else if (re2.test(w)) {\n      var fp = re2.exec(w);\n      stem = fp[1] + fp[2];\n      re2 = re_mgr1;\n      if (re2.test(stem)) {\n        w = stem;\n      }\n    }\n\n    // Step 5\n    re = re_5;\n    if (re.test(w)) {\n      var fp = re.exec(w);\n      stem = fp[1];\n   
   re = re_mgr1;\n      re2 = re_meq1;\n      re3 = re3_5;\n      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {\n        w = stem;\n      }\n    }\n\n    re = re_5_1;\n    re2 = re_mgr1;\n    if (re.test(w) && re2.test(w)) {\n      re = re_1b_2;\n      w = w.replace(re,\"\");\n    }\n\n    // and turn initial Y back to y\n\n    if (firstch == \"y\") {\n      w = firstch.toLowerCase() + w.substr(1);\n    }\n\n    return w;\n  };\n\n  return function (token) {\n    return token.update(porterStemmer);\n  }\n})();\n\nlunr.Pipeline.registerFunction(lunr.stemmer, 'stemmer')\n/*!\n * lunr.stopWordFilter\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * lunr.generateStopWordFilter builds a stopWordFilter function from the provided\n * list of stop words.\n *\n * The built in lunr.stopWordFilter is built using this generator and can be used\n * to generate custom stopWordFilters for applications or non English languages.\n *\n * @function\n * @param {Array} token The token to pass through the filter\n * @returns {lunr.PipelineFunction}\n * @see lunr.Pipeline\n * @see lunr.stopWordFilter\n */\nlunr.generateStopWordFilter = function (stopWords) {\n  var words = stopWords.reduce(function (memo, stopWord) {\n    memo[stopWord] = stopWord\n    return memo\n  }, {})\n\n  return function (token) {\n    if (token && words[token.toString()] !== token.toString()) return token\n  }\n}\n\n/**\n * lunr.stopWordFilter is an English language stop word list filter, any words\n * contained in the list will not be passed through the filter.\n *\n * This is intended to be used in the Pipeline. 
If the token does not pass the\n * filter then undefined will be returned.\n *\n * @function\n * @implements {lunr.PipelineFunction}\n * @params {lunr.Token} token - A token to check for being a stop word.\n * @returns {lunr.Token}\n * @see {@link lunr.Pipeline}\n */\nlunr.stopWordFilter = lunr.generateStopWordFilter([\n  'a',\n  'able',\n  'about',\n  'across',\n  'after',\n  'all',\n  'almost',\n  'also',\n  'am',\n  'among',\n  'an',\n  'and',\n  'any',\n  'are',\n  'as',\n  'at',\n  'be',\n  'because',\n  'been',\n  'but',\n  'by',\n  'can',\n  'cannot',\n  'could',\n  'dear',\n  'did',\n  'do',\n  'does',\n  'either',\n  'else',\n  'ever',\n  'every',\n  'for',\n  'from',\n  'get',\n  'got',\n  'had',\n  'has',\n  'have',\n  'he',\n  'her',\n  'hers',\n  'him',\n  'his',\n  'how',\n  'however',\n  'i',\n  'if',\n  'in',\n  'into',\n  'is',\n  'it',\n  'its',\n  'just',\n  'least',\n  'let',\n  'like',\n  'likely',\n  'may',\n  'me',\n  'might',\n  'most',\n  'must',\n  'my',\n  'neither',\n  'no',\n  'nor',\n  'not',\n  'of',\n  'off',\n  'often',\n  'on',\n  'only',\n  'or',\n  'other',\n  'our',\n  'own',\n  'rather',\n  'said',\n  'say',\n  'says',\n  'she',\n  'should',\n  'since',\n  'so',\n  'some',\n  'than',\n  'that',\n  'the',\n  'their',\n  'them',\n  'then',\n  'there',\n  'these',\n  'they',\n  'this',\n  'tis',\n  'to',\n  'too',\n  'twas',\n  'us',\n  'wants',\n  'was',\n  'we',\n  'were',\n  'what',\n  'when',\n  'where',\n  'which',\n  'while',\n  'who',\n  'whom',\n  'why',\n  'will',\n  'with',\n  'would',\n  'yet',\n  'you',\n  'your'\n])\n\nlunr.Pipeline.registerFunction(lunr.stopWordFilter, 'stopWordFilter')\n/*!\n * lunr.trimmer\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * lunr.trimmer is a pipeline function for trimming non word\n * characters from the beginning and end of tokens before they\n * enter the index.\n *\n * This implementation may not work correctly for non latin\n * characters and should either be removed or 
adapted for use\n * with languages with non-latin characters.\n *\n * @static\n * @implements {lunr.PipelineFunction}\n * @param {lunr.Token} token The token to pass through the filter\n * @returns {lunr.Token}\n * @see lunr.Pipeline\n */\nlunr.trimmer = function (token) {\n  return token.update(function (s) {\n    return s.replace(/^\\W+/, '').replace(/\\W+$/, '')\n  })\n}\n\nlunr.Pipeline.registerFunction(lunr.trimmer, 'trimmer')\n/*!\n * lunr.TokenSet\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * A token set is used to store the unique list of all tokens\n * within an index. Token sets are also used to represent an\n * incoming query to the index, this query token set and index\n * token set are then intersected to find which tokens to look\n * up in the inverted index.\n *\n * A token set can hold multiple tokens, as in the case of the\n * index token set, or it can hold a single token as in the\n * case of a simple query token set.\n *\n * Additionally token sets are used to perform wildcard matching.\n * Leading, contained and trailing wildcards are supported, and\n * from this edit distance matching can also be provided.\n *\n * Token sets are implemented as a minimal finite state automata,\n * where both common prefixes and suffixes are shared between tokens.\n * This helps to reduce the space used for storing the token set.\n *\n * @constructor\n */\nlunr.TokenSet = function () {\n  this.final = false\n  this.edges = {}\n  this.id = lunr.TokenSet._nextId\n  lunr.TokenSet._nextId += 1\n}\n\n/**\n * Keeps track of the next, auto increment, identifier to assign\n * to a new tokenSet.\n *\n * TokenSets require a unique identifier to be correctly minimised.\n *\n * @private\n */\nlunr.TokenSet._nextId = 1\n\n/**\n * Creates a TokenSet instance from the given sorted array of words.\n *\n * @param {String[]} arr - A sorted array of strings to create the set from.\n * @returns {lunr.TokenSet}\n * @throws Will throw an error if the input array is not 
sorted.\n */\nlunr.TokenSet.fromArray = function (arr) {\n  var builder = new lunr.TokenSet.Builder\n\n  for (var i = 0, len = arr.length; i < len; i++) {\n    builder.insert(arr[i])\n  }\n\n  builder.finish()\n  return builder.root\n}\n\n/**\n * Creates a token set from a query clause.\n *\n * @private\n * @param {Object} clause - A single clause from lunr.Query.\n * @param {string} clause.term - The query clause term.\n * @param {number} [clause.editDistance] - The optional edit distance for the term.\n * @returns {lunr.TokenSet}\n */\nlunr.TokenSet.fromClause = function (clause) {\n  if ('editDistance' in clause) {\n    return lunr.TokenSet.fromFuzzyString(clause.term, clause.editDistance)\n  } else {\n    return lunr.TokenSet.fromString(clause.term)\n  }\n}\n\n/**\n * Creates a token set representing a single string with a specified\n * edit distance.\n *\n * Insertions, deletions, substitutions and transpositions are each\n * treated as an edit distance of 1.\n *\n * Increasing the allowed edit distance will have a dramatic impact\n * on the performance of both creating and intersecting these TokenSets.\n * It is advised to keep the edit distance less than 3.\n *\n * @param {string} str - The string to create the token set from.\n * @param {number} editDistance - The allowed edit distance to match.\n * @returns {lunr.Vector}\n */\nlunr.TokenSet.fromFuzzyString = function (str, editDistance) {\n  var root = new lunr.TokenSet\n\n  var stack = [{\n    node: root,\n    editsRemaining: editDistance,\n    str: str\n  }]\n\n  while (stack.length) {\n    var frame = stack.pop()\n\n    // no edit\n    if (frame.str.length > 0) {\n      var char = frame.str.charAt(0),\n          noEditNode\n\n      if (char in frame.node.edges) {\n        noEditNode = frame.node.edges[char]\n      } else {\n        noEditNode = new lunr.TokenSet\n        frame.node.edges[char] = noEditNode\n      }\n\n      if (frame.str.length == 1) {\n        noEditNode.final = true\n      }\n\n      
stack.push({\n        node: noEditNode,\n        editsRemaining: frame.editsRemaining,\n        str: frame.str.slice(1)\n      })\n    }\n\n    if (frame.editsRemaining == 0) {\n      continue\n    }\n\n    // insertion\n    if (\"*\" in frame.node.edges) {\n      var insertionNode = frame.node.edges[\"*\"]\n    } else {\n      var insertionNode = new lunr.TokenSet\n      frame.node.edges[\"*\"] = insertionNode\n    }\n\n    if (frame.str.length == 0) {\n      insertionNode.final = true\n    }\n\n    stack.push({\n      node: insertionNode,\n      editsRemaining: frame.editsRemaining - 1,\n      str: frame.str\n    })\n\n    // deletion\n    // can only do a deletion if we have enough edits remaining\n    // and if there are characters left to delete in the string\n    if (frame.str.length > 1) {\n      stack.push({\n        node: frame.node,\n        editsRemaining: frame.editsRemaining - 1,\n        str: frame.str.slice(1)\n      })\n    }\n\n    // deletion\n    // just removing the last character from the str\n    if (frame.str.length == 1) {\n      frame.node.final = true\n    }\n\n    // substitution\n    // can only do a substitution if we have enough edits remaining\n    // and if there are characters left to substitute\n    if (frame.str.length >= 1) {\n      if (\"*\" in frame.node.edges) {\n        var substitutionNode = frame.node.edges[\"*\"]\n      } else {\n        var substitutionNode = new lunr.TokenSet\n        frame.node.edges[\"*\"] = substitutionNode\n      }\n\n      if (frame.str.length == 1) {\n        substitutionNode.final = true\n      }\n\n      stack.push({\n        node: substitutionNode,\n        editsRemaining: frame.editsRemaining - 1,\n        str: frame.str.slice(1)\n      })\n    }\n\n    // transposition\n    // can only do a transposition if there are edits remaining\n    // and there are enough characters to transpose\n    if (frame.str.length > 1) {\n      var charA = frame.str.charAt(0),\n          charB = 
frame.str.charAt(1),\n          transposeNode\n\n      if (charB in frame.node.edges) {\n        transposeNode = frame.node.edges[charB]\n      } else {\n        transposeNode = new lunr.TokenSet\n        frame.node.edges[charB] = transposeNode\n      }\n\n      if (frame.str.length == 1) {\n        transposeNode.final = true\n      }\n\n      stack.push({\n        node: transposeNode,\n        editsRemaining: frame.editsRemaining - 1,\n        str: charA + frame.str.slice(2)\n      })\n    }\n  }\n\n  return root\n}\n\n/**\n * Creates a TokenSet from a string.\n *\n * The string may contain one or more wildcard characters (*)\n * that will allow wildcard matching when intersecting with\n * another TokenSet.\n *\n * @param {string} str - The string to create a TokenSet from.\n * @returns {lunr.TokenSet}\n */\nlunr.TokenSet.fromString = function (str) {\n  var node = new lunr.TokenSet,\n      root = node\n\n  /*\n   * Iterates through all characters within the passed string\n   * appending a node for each character.\n   *\n   * When a wildcard character is found then a self\n   * referencing edge is introduced to continually match\n   * any number of any characters.\n   */\n  for (var i = 0, len = str.length; i < len; i++) {\n    var char = str[i],\n        final = (i == len - 1)\n\n    if (char == \"*\") {\n      node.edges[char] = node\n      node.final = final\n\n    } else {\n      var next = new lunr.TokenSet\n      next.final = final\n\n      node.edges[char] = next\n      node = next\n    }\n  }\n\n  return root\n}\n\n/**\n * Converts this TokenSet into an array of strings\n * contained within the TokenSet.\n *\n * This is not intended to be used on a TokenSet that\n * contains wildcards, in these cases the results are\n * undefined and are likely to cause an infinite loop.\n *\n * @returns {string[]}\n */\nlunr.TokenSet.prototype.toArray = function () {\n  var words = []\n\n  var stack = [{\n    prefix: \"\",\n    node: this\n  }]\n\n  while (stack.length) 
{\n    var frame = stack.pop(),\n        edges = Object.keys(frame.node.edges),\n        len = edges.length\n\n    if (frame.node.final) {\n      /* In Safari, at this point the prefix is sometimes corrupted, see:\n       * https://github.com/olivernn/lunr.js/issues/279 Calling any\n       * String.prototype method forces Safari to \"cast\" this string to what\n       * it's supposed to be, fixing the bug. */\n      frame.prefix.charAt(0)\n      words.push(frame.prefix)\n    }\n\n    for (var i = 0; i < len; i++) {\n      var edge = edges[i]\n\n      stack.push({\n        prefix: frame.prefix.concat(edge),\n        node: frame.node.edges[edge]\n      })\n    }\n  }\n\n  return words\n}\n\n/**\n * Generates a string representation of a TokenSet.\n *\n * This is intended to allow TokenSets to be used as keys\n * in objects, largely to aid the construction and minimisation\n * of a TokenSet. As such it is not designed to be a human\n * friendly representation of the TokenSet.\n *\n * @returns {string}\n */\nlunr.TokenSet.prototype.toString = function () {\n  // NOTE: Using Object.keys here as this.edges is very likely\n  // to enter 'hash-mode' with many keys being added\n  //\n  // avoiding a for-in loop here as it leads to the function\n  // being de-optimised (at least in V8). From some simple\n  // benchmarks the performance is comparable, but allowing\n  // V8 to optimize may mean easy performance wins in the future.\n\n  if (this._str) {\n    return this._str\n  }\n\n  var str = this.final ? 
'1' : '0',\n      labels = Object.keys(this.edges).sort(),\n      len = labels.length\n\n  for (var i = 0; i < len; i++) {\n    var label = labels[i],\n        node = this.edges[label]\n\n    str = str + label + node.id\n  }\n\n  return str\n}\n\n/**\n * Returns a new TokenSet that is the intersection of\n * this TokenSet and the passed TokenSet.\n *\n * This intersection will take into account any wildcards\n * contained within the TokenSet.\n *\n * @param {lunr.TokenSet} b - An other TokenSet to intersect with.\n * @returns {lunr.TokenSet}\n */\nlunr.TokenSet.prototype.intersect = function (b) {\n  var output = new lunr.TokenSet,\n      frame = undefined\n\n  var stack = [{\n    qNode: b,\n    output: output,\n    node: this\n  }]\n\n  while (stack.length) {\n    frame = stack.pop()\n\n    // NOTE: As with the #toString method, we are using\n    // Object.keys and a for loop instead of a for-in loop\n    // as both of these objects enter 'hash' mode, causing\n    // the function to be de-optimised in V8\n    var qEdges = Object.keys(frame.qNode.edges),\n        qLen = qEdges.length,\n        nEdges = Object.keys(frame.node.edges),\n        nLen = nEdges.length\n\n    for (var q = 0; q < qLen; q++) {\n      var qEdge = qEdges[q]\n\n      for (var n = 0; n < nLen; n++) {\n        var nEdge = nEdges[n]\n\n        if (nEdge == qEdge || qEdge == '*') {\n          var node = frame.node.edges[nEdge],\n              qNode = frame.qNode.edges[qEdge],\n              final = node.final && qNode.final,\n              next = undefined\n\n          if (nEdge in frame.output.edges) {\n            // an edge already exists for this character\n            // no need to create a new node, just set the finality\n            // bit unless this node is already final\n            next = frame.output.edges[nEdge]\n            next.final = next.final || final\n\n          } else {\n            // no edge exists yet, must create one\n            // set the finality bit and insert it\n    
        // into the output\n            next = new lunr.TokenSet\n            next.final = final\n            frame.output.edges[nEdge] = next\n          }\n\n          stack.push({\n            qNode: qNode,\n            output: next,\n            node: node\n          })\n        }\n      }\n    }\n  }\n\n  return output\n}\nlunr.TokenSet.Builder = function () {\n  this.previousWord = \"\"\n  this.root = new lunr.TokenSet\n  this.uncheckedNodes = []\n  this.minimizedNodes = {}\n}\n\nlunr.TokenSet.Builder.prototype.insert = function (word) {\n  var node,\n      commonPrefix = 0\n\n  if (word < this.previousWord) {\n    throw new Error (\"Out of order word insertion\")\n  }\n\n  for (var i = 0; i < word.length && i < this.previousWord.length; i++) {\n    if (word[i] != this.previousWord[i]) break\n    commonPrefix++\n  }\n\n  this.minimize(commonPrefix)\n\n  if (this.uncheckedNodes.length == 0) {\n    node = this.root\n  } else {\n    node = this.uncheckedNodes[this.uncheckedNodes.length - 1].child\n  }\n\n  for (var i = commonPrefix; i < word.length; i++) {\n    var nextNode = new lunr.TokenSet,\n        char = word[i]\n\n    node.edges[char] = nextNode\n\n    this.uncheckedNodes.push({\n      parent: node,\n      char: char,\n      child: nextNode\n    })\n\n    node = nextNode\n  }\n\n  node.final = true\n  this.previousWord = word\n}\n\nlunr.TokenSet.Builder.prototype.finish = function () {\n  this.minimize(0)\n}\n\nlunr.TokenSet.Builder.prototype.minimize = function (downTo) {\n  for (var i = this.uncheckedNodes.length - 1; i >= downTo; i--) {\n    var node = this.uncheckedNodes[i],\n        childKey = node.child.toString()\n\n    if (childKey in this.minimizedNodes) {\n      node.parent.edges[node.char] = this.minimizedNodes[childKey]\n    } else {\n      // Cache the key for this node since\n      // we know it can't change anymore\n      node.child._str = childKey\n\n      this.minimizedNodes[childKey] = node.child\n    }\n\n    this.uncheckedNodes.pop()\n  
}\n}\n/*!\n * lunr.Index\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * An index contains the built index of all documents and provides a query interface\n * to the index.\n *\n * Usually instances of lunr.Index will not be created using this constructor, instead\n * lunr.Builder should be used to construct new indexes, or lunr.Index.load should be\n * used to load previously built and serialized indexes.\n *\n * @constructor\n * @param {Object} attrs - The attributes of the built search index.\n * @param {Object} attrs.invertedIndex - An index of term/field to document reference.\n * @param {Object<string, lunr.Vector>} attrs.fieldVectors - Field vectors\n * @param {lunr.TokenSet} attrs.tokenSet - An set of all corpus tokens.\n * @param {string[]} attrs.fields - The names of indexed document fields.\n * @param {lunr.Pipeline} attrs.pipeline - The pipeline to use for search terms.\n */\nlunr.Index = function (attrs) {\n  this.invertedIndex = attrs.invertedIndex\n  this.fieldVectors = attrs.fieldVectors\n  this.tokenSet = attrs.tokenSet\n  this.fields = attrs.fields\n  this.pipeline = attrs.pipeline\n}\n\n/**\n * A result contains details of a document matching a search query.\n * @typedef {Object} lunr.Index~Result\n * @property {string} ref - The reference of the document this result represents.\n * @property {number} score - A number between 0 and 1 representing how similar this document is to the query.\n * @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.\n */\n\n/**\n * Although lunr provides the ability to create queries using lunr.Query, it also provides a simple\n * query language which itself is parsed into an instance of lunr.Query.\n *\n * For programmatically building queries it is advised to directly use lunr.Query, the query language\n * is best used for human entered text rather than program generated text.\n *\n * At its simplest queries can just be a single term, e.g. 
`hello`, multiple terms are also supported\n * and will be combined with OR, e.g `hello world` will match documents that contain either 'hello'\n * or 'world', though those that contain both will rank higher in the results.\n *\n * Wildcards can be included in terms to match one or more unspecified characters, these wildcards can\n * be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding\n * wildcards will increase the number of documents that will be found but can also have a negative\n * impact on query performance, especially with wildcards at the beginning of a term.\n *\n * Terms can be restricted to specific fields, e.g. `title:hello`, only documents with the term\n * hello in the title field will match this query. Using a field not present in the index will lead\n * to an error being thrown.\n *\n * Modifiers can also be added to terms, lunr supports edit distance and boost modifiers on terms. A term\n * boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported\n * to provide fuzzy matching, e.g. 'hello~2' will match documents with hello with an edit distance of 2.\n * Avoid large values for edit distance to improve query performance.\n *\n * Each term also supports a presence modifier. By default a term's presence in document is optional, however\n * this can be changed to either required or prohibited. For a term's presence to be required in a document the\n * term should be prefixed with a '+', e.g. `+foo bar` is a search for documents that must contain 'foo' and\n * optionally contain 'bar'. Conversely a leading '-' sets the terms presence to prohibited, i.e. it must not\n * appear in a document, e.g. `-foo bar` is a search for documents that do not contain 'foo' but may contain 'bar'.\n *\n * To escape special characters the backslash character '\\' can be used, this allows searches to include\n * characters that would normally be considered modifiers, e.g. 
`foo\\~2` will search for a term \"foo~2\" instead\n * of attempting to apply a boost of 2 to the search term \"foo\".\n *\n * @typedef {string} lunr.Index~QueryString\n * @example <caption>Simple single term query</caption>\n * hello\n * @example <caption>Multiple term query</caption>\n * hello world\n * @example <caption>term scoped to a field</caption>\n * title:hello\n * @example <caption>term with a boost of 10</caption>\n * hello^10\n * @example <caption>term with an edit distance of 2</caption>\n * hello~2\n * @example <caption>terms with presence modifiers</caption>\n * -foo +bar baz\n */\n\n/**\n * Performs a search against the index using lunr query syntax.\n *\n * Results will be returned sorted by their score, the most relevant results\n * will be returned first.  For details on how the score is calculated, please see\n * the {@link https://lunrjs.com/guides/searching.html#scoring|guide}.\n *\n * For more programmatic querying use lunr.Index#query.\n *\n * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.\n * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.\n * @returns {lunr.Index~Result[]}\n */\nlunr.Index.prototype.search = function (queryString) {\n  return this.query(function (query) {\n    var parser = new lunr.QueryParser(queryString, query)\n    parser.parse()\n  })\n}\n\n/**\n * A query builder callback provides a query object to be used to express\n * the query to perform on the index.\n *\n * @callback lunr.Index~queryBuilder\n * @param {lunr.Query} query - The query object to build up.\n * @this lunr.Query\n */\n\n/**\n * Performs a query against the index using the yielded lunr.Query object.\n *\n * If performing programmatic queries against the index, this method is preferred\n * over lunr.Index#search so as to avoid the additional query parsing overhead.\n *\n * A query object is yielded to the supplied function which should be used to\n * express the query to be run against the 
index.\n *\n * Note that although this function takes a callback parameter it is _not_ an\n * asynchronous operation, the callback is just yielded a query object to be\n * customized.\n *\n * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.\n * @returns {lunr.Index~Result[]}\n */\nlunr.Index.prototype.query = function (fn) {\n  // for each query clause\n  // * process terms\n  // * expand terms from token set\n  // * find matching documents and metadata\n  // * get document vectors\n  // * score documents\n\n  var query = new lunr.Query(this.fields),\n      matchingFields = Object.create(null),\n      queryVectors = Object.create(null),\n      termFieldCache = Object.create(null),\n      requiredMatches = Object.create(null),\n      prohibitedMatches = Object.create(null)\n\n  /*\n   * To support field level boosts a query vector is created per\n   * field. An empty vector is eagerly created to support negated\n   * queries.\n   */\n  for (var i = 0; i < this.fields.length; i++) {\n    queryVectors[this.fields[i]] = new lunr.Vector\n  }\n\n  fn.call(query, query)\n\n  for (var i = 0; i < query.clauses.length; i++) {\n    /*\n     * Unless the pipeline has been disabled for this term, which is\n     * the case for terms with wildcards, we need to pass the clause\n     * term through the search pipeline. A pipeline returns an array\n     * of processed terms. 
Pipeline functions may expand the passed\n     * term, which means we may end up performing multiple index lookups\n     * for a single query term.\n     */\n    var clause = query.clauses[i],\n        terms = null,\n        clauseMatches = lunr.Set.empty\n\n    if (clause.usePipeline) {\n      terms = this.pipeline.runString(clause.term, {\n        fields: clause.fields\n      })\n    } else {\n      terms = [clause.term]\n    }\n\n    for (var m = 0; m < terms.length; m++) {\n      var term = terms[m]\n\n      /*\n       * Each term returned from the pipeline needs to use the same query\n       * clause object, e.g. the same boost and or edit distance. The\n       * simplest way to do this is to re-use the clause object but mutate\n       * its term property.\n       */\n      clause.term = term\n\n      /*\n       * From the term in the clause we create a token set which will then\n       * be used to intersect the indexes token set to get a list of terms\n       * to lookup in the inverted index\n       */\n      var termTokenSet = lunr.TokenSet.fromClause(clause),\n          expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()\n\n      /*\n       * If a term marked as required does not exist in the tokenSet it is\n       * impossible for the search to return any matches. 
We set all the field\n       * scoped required matches set to empty and stop examining any further\n       * clauses.\n       */\n      if (expandedTerms.length === 0 && clause.presence === lunr.Query.presence.REQUIRED) {\n        for (var k = 0; k < clause.fields.length; k++) {\n          var field = clause.fields[k]\n          requiredMatches[field] = lunr.Set.empty\n        }\n\n        break\n      }\n\n      for (var j = 0; j < expandedTerms.length; j++) {\n        /*\n         * For each term get the posting and termIndex, this is required for\n         * building the query vector.\n         */\n        var expandedTerm = expandedTerms[j],\n            posting = this.invertedIndex[expandedTerm],\n            termIndex = posting._index\n\n        for (var k = 0; k < clause.fields.length; k++) {\n          /*\n           * For each field that this query term is scoped by (by default\n           * all fields are in scope) we need to get all the document refs\n           * that have this term in that field.\n           *\n           * The posting is the entry in the invertedIndex for the matching\n           * term from above.\n           */\n          var field = clause.fields[k],\n              fieldPosting = posting[field],\n              matchingDocumentRefs = Object.keys(fieldPosting),\n              termField = expandedTerm + \"/\" + field,\n              matchingDocumentsSet = new lunr.Set(matchingDocumentRefs)\n\n          /*\n           * if the presence of this term is required ensure that the matching\n           * documents are added to the set of required matches for this clause.\n           *\n           */\n          if (clause.presence == lunr.Query.presence.REQUIRED) {\n            clauseMatches = clauseMatches.union(matchingDocumentsSet)\n\n            if (requiredMatches[field] === undefined) {\n              requiredMatches[field] = lunr.Set.complete\n            }\n          }\n\n          /*\n           * if the presence of this term is 
prohibited ensure that the matching\n           * documents are added to the set of prohibited matches for this field,\n           * creating that set if it does not yet exist.\n           */\n          if (clause.presence == lunr.Query.presence.PROHIBITED) {\n            if (prohibitedMatches[field] === undefined) {\n              prohibitedMatches[field] = lunr.Set.empty\n            }\n\n            prohibitedMatches[field] = prohibitedMatches[field].union(matchingDocumentsSet)\n\n            /*\n             * Prohibited matches should not be part of the query vector used for\n             * similarity scoring and no metadata should be extracted so we continue\n             * to the next field\n             */\n            continue\n          }\n\n          /*\n           * The query field vector is populated using the termIndex found for\n           * the term and a unit value with the appropriate boost applied.\n           * Using upsert because there could already be an entry in the vector\n           * for the term we are working with. 
In that case we just add the scores\n           * together.\n           */\n          queryVectors[field].upsert(termIndex, clause.boost, function (a, b) { return a + b })\n\n          /**\n           * If we've already seen this term, field combo then we've already collected\n           * the matching documents and metadata, no need to go through all that again\n           */\n          if (termFieldCache[termField]) {\n            continue\n          }\n\n          for (var l = 0; l < matchingDocumentRefs.length; l++) {\n            /*\n             * All metadata for this term/field/document triple\n             * are then extracted and collected into an instance\n             * of lunr.MatchData ready to be returned in the query\n             * results\n             */\n            var matchingDocumentRef = matchingDocumentRefs[l],\n                matchingFieldRef = new lunr.FieldRef (matchingDocumentRef, field),\n                metadata = fieldPosting[matchingDocumentRef],\n                fieldMatch\n\n            if ((fieldMatch = matchingFields[matchingFieldRef]) === undefined) {\n              matchingFields[matchingFieldRef] = new lunr.MatchData (expandedTerm, field, metadata)\n            } else {\n              fieldMatch.add(expandedTerm, field, metadata)\n            }\n\n          }\n\n          termFieldCache[termField] = true\n        }\n      }\n    }\n\n    /**\n     * If the presence was required we need to update the requiredMatches field sets.\n     * We do this after all fields for the term have collected their matches because\n     * the clause terms presence is required in _any_ of the fields not _all_ of the\n     * fields.\n     */\n    if (clause.presence === lunr.Query.presence.REQUIRED) {\n      for (var k = 0; k < clause.fields.length; k++) {\n        var field = clause.fields[k]\n        requiredMatches[field] = requiredMatches[field].intersect(clauseMatches)\n      }\n    }\n  }\n\n  /**\n   * Need to combine the field scoped 
required and prohibited\n   * matching documents into a global set of required and prohibited\n   * matches\n   */\n  var allRequiredMatches = lunr.Set.complete,\n      allProhibitedMatches = lunr.Set.empty\n\n  for (var i = 0; i < this.fields.length; i++) {\n    var field = this.fields[i]\n\n    if (requiredMatches[field]) {\n      allRequiredMatches = allRequiredMatches.intersect(requiredMatches[field])\n    }\n\n    if (prohibitedMatches[field]) {\n      allProhibitedMatches = allProhibitedMatches.union(prohibitedMatches[field])\n    }\n  }\n\n  var matchingFieldRefs = Object.keys(matchingFields),\n      results = [],\n      matches = Object.create(null)\n\n  /*\n   * If the query is negated (contains only prohibited terms)\n   * we need to get _all_ fieldRefs currently existing in the\n   * index. This is only done when we know that the query is\n   * entirely prohibited terms to avoid any cost of getting all\n   * fieldRefs unnecessarily.\n   *\n   * Additionally, blank MatchData must be created to correctly\n   * populate the results.\n   */\n  if (query.isNegated()) {\n    matchingFieldRefs = Object.keys(this.fieldVectors)\n\n    for (var i = 0; i < matchingFieldRefs.length; i++) {\n      var matchingFieldRef = matchingFieldRefs[i]\n      var fieldRef = lunr.FieldRef.fromString(matchingFieldRef)\n      matchingFields[matchingFieldRef] = new lunr.MatchData\n    }\n  }\n\n  for (var i = 0; i < matchingFieldRefs.length; i++) {\n    /*\n     * Currently we have document fields that match the query, but we\n     * need to return documents. 
The matchData and scores are combined\n     * from multiple fields belonging to the same document.\n     *\n     * Scores are calculated by field, using the query vectors created\n     * above, and combined into a final document score using addition.\n     */\n    var fieldRef = lunr.FieldRef.fromString(matchingFieldRefs[i]),\n        docRef = fieldRef.docRef\n\n    if (!allRequiredMatches.contains(docRef)) {\n      continue\n    }\n\n    if (allProhibitedMatches.contains(docRef)) {\n      continue\n    }\n\n    var fieldVector = this.fieldVectors[fieldRef],\n        score = queryVectors[fieldRef.fieldName].similarity(fieldVector),\n        docMatch\n\n    if ((docMatch = matches[docRef]) !== undefined) {\n      docMatch.score += score\n      docMatch.matchData.combine(matchingFields[fieldRef])\n    } else {\n      var match = {\n        ref: docRef,\n        score: score,\n        matchData: matchingFields[fieldRef]\n      }\n      matches[docRef] = match\n      results.push(match)\n    }\n  }\n\n  /*\n   * Sort the results objects by score, highest first.\n   */\n  return results.sort(function (a, b) {\n    return b.score - a.score\n  })\n}\n\n/**\n * Prepares the index for JSON serialization.\n *\n * The schema for this JSON blob will be described in a\n * separate JSON schema file.\n *\n * @returns {Object}\n */\nlunr.Index.prototype.toJSON = function () {\n  var invertedIndex = Object.keys(this.invertedIndex)\n    .sort()\n    .map(function (term) {\n      return [term, this.invertedIndex[term]]\n    }, this)\n\n  var fieldVectors = Object.keys(this.fieldVectors)\n    .map(function (ref) {\n      return [ref, this.fieldVectors[ref].toJSON()]\n    }, this)\n\n  return {\n    version: lunr.version,\n    fields: this.fields,\n    fieldVectors: fieldVectors,\n    invertedIndex: invertedIndex,\n    pipeline: this.pipeline.toJSON()\n  }\n}\n\n/**\n * Loads a previously serialized lunr.Index\n *\n * @param {Object} serializedIndex - A previously serialized 
lunr.Index\n * @returns {lunr.Index}\n */\nlunr.Index.load = function (serializedIndex) {\n  var attrs = {},\n      fieldVectors = {},\n      serializedVectors = serializedIndex.fieldVectors,\n      invertedIndex = Object.create(null),\n      serializedInvertedIndex = serializedIndex.invertedIndex,\n      tokenSetBuilder = new lunr.TokenSet.Builder,\n      pipeline = lunr.Pipeline.load(serializedIndex.pipeline)\n\n  if (serializedIndex.version != lunr.version) {\n    lunr.utils.warn(\"Version mismatch when loading serialised index. Current version of lunr '\" + lunr.version + \"' does not match serialized index '\" + serializedIndex.version + \"'\")\n  }\n\n  for (var i = 0; i < serializedVectors.length; i++) {\n    var tuple = serializedVectors[i],\n        ref = tuple[0],\n        elements = tuple[1]\n\n    fieldVectors[ref] = new lunr.Vector(elements)\n  }\n\n  for (var i = 0; i < serializedInvertedIndex.length; i++) {\n    var tuple = serializedInvertedIndex[i],\n        term = tuple[0],\n        posting = tuple[1]\n\n    tokenSetBuilder.insert(term)\n    invertedIndex[term] = posting\n  }\n\n  tokenSetBuilder.finish()\n\n  attrs.fields = serializedIndex.fields\n\n  attrs.fieldVectors = fieldVectors\n  attrs.invertedIndex = invertedIndex\n  attrs.tokenSet = tokenSetBuilder.root\n  attrs.pipeline = pipeline\n\n  return new lunr.Index(attrs)\n}\n/*!\n * lunr.Builder\n * Copyright (C) 2020 Oliver Nightingale\n */\n\n/**\n * lunr.Builder performs indexing on a set of documents and\n * returns instances of lunr.Index ready for querying.\n *\n * All configuration of the index is done via the builder, the\n * fields to index, the document reference, the text processing\n * pipeline and document scoring parameters are all set on the\n * builder before indexing.\n *\n * @constructor\n * @property {string} _ref - Internal reference to the document reference field.\n * @property {string[]} _fields - Internal reference to the document fields to index.\n * @property 
{object} invertedIndex - The inverted index maps terms to document fields.\n * @property {object} documentTermFrequencies - Keeps track of document term frequencies.\n * @property {object} documentLengths - Keeps track of the length of documents added to the index.\n * @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.\n * @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.\n * @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.\n * @property {number} documentCount - Keeps track of the total number of documents indexed.\n * @property {number} _b - A parameter to control field length normalization, setting this to 0 disabled normalization, 1 fully normalizes field lengths, the default value is 0.75.\n * @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.\n * @property {number} termIndex - A counter incremented for each unique term, used to identify a terms position in the vector space.\n * @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.\n */\nlunr.Builder = function () {\n  this._ref = \"id\"\n  this._fields = Object.create(null)\n  this._documents = Object.create(null)\n  this.invertedIndex = Object.create(null)\n  this.fieldTermFrequencies = {}\n  this.fieldLengths = {}\n  this.tokenizer = lunr.tokenizer\n  this.pipeline = new lunr.Pipeline\n  this.searchPipeline = new lunr.Pipeline\n  this.documentCount = 0\n  this._b = 0.75\n  this._k1 = 1.2\n  this.termIndex = 0\n  this.metadataWhitelist = []\n}\n\n/**\n * Sets the document field used as the document reference. 
Every document must have this field.\n * The type of this field in the document should be a string, if it is not a string it will be\n * coerced into a string by calling toString.\n *\n * The default ref is 'id'.\n *\n * The ref should _not_ be changed during indexing, it should be set before any documents are\n * added to the index. Changing it during indexing can lead to inconsistent results.\n *\n * @param {string} ref - The name of the reference field in the document.\n */\nlunr.Builder.prototype.ref = function (ref) {\n  this._ref = ref\n}\n\n/**\n * A function that is used to extract a field from a document.\n *\n * Lunr expects a field to be at the top level of a document, if however the field\n * is deeply nested within a document an extractor function can be used to extract\n * the right field for indexing.\n *\n * @callback fieldExtractor\n * @param {object} doc - The document being added to the index.\n * @returns {?(string|object|object[])} obj - The object that will be indexed for this field.\n * @example <caption>Extracting a nested field</caption>\n * function (doc) { return doc.nested.field }\n */\n\n/**\n * Adds a field to the list of document fields that will be indexed. Every document being\n * indexed should have this field. Null values for this field in indexed documents will\n * not cause errors but will limit the chance of that document being retrieved by searches.\n *\n * All fields should be added before adding documents to the index. Adding fields after\n * a document has been indexed will have no effect on already indexed documents.\n *\n * Fields can be boosted at build time. This allows terms within that field to have more\n * importance when ranking search results. 
Use a field boost to specify that matches within\n * one field are more important than other fields.\n *\n * @param {string} fieldName - The name of a field to index in all documents.\n * @param {object} attributes - Optional attributes associated with this field.\n * @param {number} [attributes.boost=1] - Boost applied to all terms within this field.\n * @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.\n * @throws {RangeError} fieldName cannot contain unsupported characters '/'\n */\nlunr.Builder.prototype.field = function (fieldName, attributes) {\n  if (/\\//.test(fieldName)) {\n    throw new RangeError (\"Field '\" + fieldName + \"' contains illegal character '/'\")\n  }\n\n  this._fields[fieldName] = attributes || {}\n}\n\n/**\n * A parameter to tune the amount of field length normalisation that is applied when\n * calculating relevance scores. A value of 0 will completely disable any normalisation\n * and a value of 1 will fully normalise field lengths. The default is 0.75. Values of b\n * will be clamped to the range 0 - 1.\n *\n * @param {number} number - The value to set for this tuning parameter.\n */\nlunr.Builder.prototype.b = function (number) {\n  if (number < 0) {\n    this._b = 0\n  } else if (number > 1) {\n    this._b = 1\n  } else {\n    this._b = number\n  }\n}\n\n/**\n * A parameter that controls the speed at which a rise in term frequency results in term\n * frequency saturation. The default value is 1.2. 
Setting this to a higher value will give\n * slower saturation levels, a lower value will result in quicker saturation.\n *\n * @param {number} number - The value to set for this tuning parameter.\n */\nlunr.Builder.prototype.k1 = function (number) {\n  this._k1 = number\n}\n\n/**\n * Adds a document to the index.\n *\n * Before adding fields to the index the index should have been fully setup, with the document\n * ref and all fields to index already having been specified.\n *\n * The document must have a field name as specified by the ref (by default this is 'id') and\n * it should have all fields defined for indexing, though null or undefined values will not\n * cause errors.\n *\n * Entire documents can be boosted at build time. Applying a boost to a document indicates that\n * this document should rank higher in search results than other documents.\n *\n * @param {object} doc - The document to add to the index.\n * @param {object} attributes - Optional attributes associated with this document.\n * @param {number} [attributes.boost=1] - Boost applied to all terms within this document.\n */\nlunr.Builder.prototype.add = function (doc, attributes) {\n  var docRef = doc[this._ref],\n      fields = Object.keys(this._fields)\n\n  this._documents[docRef] = attributes || {}\n  this.documentCount += 1\n\n  for (var i = 0; i < fields.length; i++) {\n    var fieldName = fields[i],\n        extractor = this._fields[fieldName].extractor,\n        field = extractor ? 
extractor(doc) : doc[fieldName],\n        tokens = this.tokenizer(field, {\n          fields: [fieldName]\n        }),\n        terms = this.pipeline.run(tokens),\n        fieldRef = new lunr.FieldRef (docRef, fieldName),\n        fieldTerms = Object.create(null)\n\n    this.fieldTermFrequencies[fieldRef] = fieldTerms\n    this.fieldLengths[fieldRef] = 0\n\n    // store the length of this field for this document\n    this.fieldLengths[fieldRef] += terms.length\n\n    // calculate term frequencies for this field\n    for (var j = 0; j < terms.length; j++) {\n      var term = terms[j]\n\n      if (fieldTerms[term] == undefined) {\n        fieldTerms[term] = 0\n      }\n\n      fieldTerms[term] += 1\n\n      // add to inverted index\n      // create an initial posting if one doesn't exist\n      if (this.invertedIndex[term] == undefined) {\n        var posting = Object.create(null)\n        posting[\"_index\"] = this.termIndex\n        this.termIndex += 1\n\n        for (var k = 0; k < fields.length; k++) {\n          posting[fields[k]] = Object.create(null)\n        }\n\n        this.invertedIndex[term] = posting\n      }\n\n      // add an entry for this term/fieldName/docRef to the invertedIndex\n      if (this.invertedIndex[term][fieldName][docRef] == undefined) {\n        this.invertedIndex[term][fieldName][docRef] = Object.create(null)\n      }\n\n      // store all whitelisted metadata about this token in the\n      // inverted index\n      for (var l = 0; l < this.metadataWhitelist.length; l++) {\n        var metadataKey = this.metadataWhitelist[l],\n            metadata = term.metadata[metadataKey]\n\n        if (this.invertedIndex[term][fieldName][docRef][metadataKey] == undefined) {\n          this.invertedIndex[term][fieldName][docRef][metadataKey] = []\n        }\n\n        this.invertedIndex[term][fieldName][docRef][metadataKey].push(metadata)\n      }\n    }\n\n  }\n}\n\n/**\n * Calculates the average document length for this index\n *\n * @private\n 
*/\nlunr.Builder.prototype.calculateAverageFieldLengths = function () {\n\n  var fieldRefs = Object.keys(this.fieldLengths),\n      numberOfFields = fieldRefs.length,\n      accumulator = {},\n      documentsWithField = {}\n\n  for (var i = 0; i < numberOfFields; i++) {\n    var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),\n        field = fieldRef.fieldName\n\n    documentsWithField[field] || (documentsWithField[field] = 0)\n    documentsWithField[field] += 1\n\n    accumulator[field] || (accumulator[field] = 0)\n    accumulator[field] += this.fieldLengths[fieldRef]\n  }\n\n  var fields = Object.keys(this._fields)\n\n  for (var i = 0; i < fields.length; i++) {\n    var fieldName = fields[i]\n    accumulator[fieldName] = accumulator[fieldName] / documentsWithField[fieldName]\n  }\n\n  this.averageFieldLength = accumulator\n}\n\n/**\n * Builds a vector space model of every document using lunr.Vector\n *\n * @private\n */\nlunr.Builder.prototype.createFieldVectors = function () {\n  var fieldVectors = {},\n      fieldRefs = Object.keys(this.fieldTermFrequencies),\n      fieldRefsLength = fieldRefs.length,\n      termIdfCache = Object.create(null)\n\n  for (var i = 0; i < fieldRefsLength; i++) {\n    var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),\n        fieldName = fieldRef.fieldName,\n        fieldLength = this.fieldLengths[fieldRef],\n        fieldVector = new lunr.Vector,\n        termFrequencies = this.fieldTermFrequencies[fieldRef],\n        terms = Object.keys(termFrequencies),\n        termsLength = terms.length\n\n\n    var fieldBoost = this._fields[fieldName].boost || 1,\n        docBoost = this._documents[fieldRef.docRef].boost || 1\n\n    for (var j = 0; j < termsLength; j++) {\n      var term = terms[j],\n          tf = termFrequencies[term],\n          termIndex = this.invertedIndex[term]._index,\n          idf, score, scoreWithPrecision\n\n      if (termIdfCache[term] === undefined) {\n        idf = lunr.idf(this.invertedIndex[term], 
this.documentCount)\n        termIdfCache[term] = idf\n      } else {\n        idf = termIdfCache[term]\n      }\n\n      score = idf * ((this._k1 + 1) * tf) / (this._k1 * (1 - this._b + this._b * (fieldLength / this.averageFieldLength[fieldName])) + tf)\n      score *= fieldBoost\n      score *= docBoost\n      scoreWithPrecision = Math.round(score * 1000) / 1000\n      // Converts 1.23456789 to 1.234.\n      // Reducing the precision so that the vectors take up less\n      // space when serialised. Doing it now so that they behave\n      // the same before and after serialisation. Also, this is\n      // the fastest approach to reducing a number's precision in\n      // JavaScript.\n\n      fieldVector.insert(termIndex, scoreWithPrecision)\n    }\n\n    fieldVectors[fieldRef] = fieldVector\n  }\n\n  this.fieldVectors = fieldVectors\n}\n\n/**\n * Creates a token set of all tokens in the index using lunr.TokenSet\n *\n * @private\n */\nlunr.Builder.prototype.createTokenSet = function () {\n  this.tokenSet = lunr.TokenSet.fromArray(\n    Object.keys(this.invertedIndex).sort()\n  )\n}\n\n/**\n * Builds the index, creating an instance of lunr.Index.\n *\n * This completes the indexing process and should only be called\n * once all documents have been added to the index.\n *\n * @returns {lunr.Index}\n */\nlunr.Builder.prototype.build = function () {\n  this.calculateAverageFieldLengths()\n  this.createFieldVectors()\n  this.createTokenSet()\n\n  return new lunr.Index({\n    invertedIndex: this.invertedIndex,\n    fieldVectors: this.fieldVectors,\n    tokenSet: this.tokenSet,\n    fields: Object.keys(this._fields),\n    pipeline: this.searchPipeline\n  })\n}\n\n/**\n * Applies a plugin to the index builder.\n *\n * A plugin is a function that is called with the index builder as its context.\n * Plugins can be used to customise or extend the behaviour of the index\n * in some way. 
A plugin is just a function, that encapsulated the custom\n * behaviour that should be applied when building the index.\n *\n * The plugin function will be called with the index builder as its argument, additional\n * arguments can also be passed when calling use. The function will be called\n * with the index builder as its context.\n *\n * @param {Function} plugin The plugin to apply.\n */\nlunr.Builder.prototype.use = function (fn) {\n  var args = Array.prototype.slice.call(arguments, 1)\n  args.unshift(this)\n  fn.apply(this, args)\n}\n/**\n * Contains and collects metadata about a matching document.\n * A single instance of lunr.MatchData is returned as part of every\n * lunr.Index~Result.\n *\n * @constructor\n * @param {string} term - The term this match data is associated with\n * @param {string} field - The field in which the term was found\n * @param {object} metadata - The metadata recorded about this term in this field\n * @property {object} metadata - A cloned collection of metadata associated with this document.\n * @see {@link lunr.Index~Result}\n */\nlunr.MatchData = function (term, field, metadata) {\n  var clonedMetadata = Object.create(null),\n      metadataKeys = Object.keys(metadata || {})\n\n  // Cloning the metadata to prevent the original\n  // being mutated during match data combination.\n  // Metadata is kept in an array within the inverted\n  // index so cloning the data can be done with\n  // Array#slice\n  for (var i = 0; i < metadataKeys.length; i++) {\n    var key = metadataKeys[i]\n    clonedMetadata[key] = metadata[key].slice()\n  }\n\n  this.metadata = Object.create(null)\n\n  if (term !== undefined) {\n    this.metadata[term] = Object.create(null)\n    this.metadata[term][field] = clonedMetadata\n  }\n}\n\n/**\n * An instance of lunr.MatchData will be created for every term that matches a\n * document. However only one instance is required in a lunr.Index~Result. 
This\n * method combines metadata from another instance of lunr.MatchData with this\n * objects metadata.\n *\n * @param {lunr.MatchData} otherMatchData - Another instance of match data to merge with this one.\n * @see {@link lunr.Index~Result}\n */\nlunr.MatchData.prototype.combine = function (otherMatchData) {\n  var terms = Object.keys(otherMatchData.metadata)\n\n  for (var i = 0; i < terms.length; i++) {\n    var term = terms[i],\n        fields = Object.keys(otherMatchData.metadata[term])\n\n    if (this.metadata[term] == undefined) {\n      this.metadata[term] = Object.create(null)\n    }\n\n    for (var j = 0; j < fields.length; j++) {\n      var field = fields[j],\n          keys = Object.keys(otherMatchData.metadata[term][field])\n\n      if (this.metadata[term][field] == undefined) {\n        this.metadata[term][field] = Object.create(null)\n      }\n\n      for (var k = 0; k < keys.length; k++) {\n        var key = keys[k]\n\n        if (this.metadata[term][field][key] == undefined) {\n          this.metadata[term][field][key] = otherMatchData.metadata[term][field][key]\n        } else {\n          this.metadata[term][field][key] = this.metadata[term][field][key].concat(otherMatchData.metadata[term][field][key])\n        }\n\n      }\n    }\n  }\n}\n\n/**\n * Add metadata for a term/field pair to this instance of match data.\n *\n * @param {string} term - The term this match data is associated with\n * @param {string} field - The field in which the term was found\n * @param {object} metadata - The metadata recorded about this term in this field\n */\nlunr.MatchData.prototype.add = function (term, field, metadata) {\n  if (!(term in this.metadata)) {\n    this.metadata[term] = Object.create(null)\n    this.metadata[term][field] = metadata\n    return\n  }\n\n  if (!(field in this.metadata[term])) {\n    this.metadata[term][field] = metadata\n    return\n  }\n\n  var metadataKeys = Object.keys(metadata)\n\n  for (var i = 0; i < metadataKeys.length; i++) 
{\n    var key = metadataKeys[i]\n\n    if (key in this.metadata[term][field]) {\n      this.metadata[term][field][key] = this.metadata[term][field][key].concat(metadata[key])\n    } else {\n      this.metadata[term][field][key] = metadata[key]\n    }\n  }\n}\n/**\n * A lunr.Query provides a programmatic way of defining queries to be performed\n * against a {@link lunr.Index}.\n *\n * Prefer constructing a lunr.Query using the {@link lunr.Index#query} method\n * so the query object is pre-initialized with the right index fields.\n *\n * @constructor\n * @property {lunr.Query~Clause[]} clauses - An array of query clauses.\n * @property {string[]} allFields - An array of all available fields in a lunr.Index.\n */\nlunr.Query = function (allFields) {\n  this.clauses = []\n  this.allFields = allFields\n}\n\n/**\n * Constants for indicating what kind of automatic wildcard insertion will be used when constructing a query clause.\n *\n * This allows wildcards to be added to the beginning and end of a term without having to manually do any string\n * concatenation.\n *\n * The wildcard constants can be bitwise combined to select both leading and trailing wildcards.\n *\n * @constant\n * @default\n * @property {number} wildcard.NONE - The term will have no wildcards inserted, this is the default behaviour\n * @property {number} wildcard.LEADING - Prepend the term with a wildcard, unless a leading wildcard already exists\n * @property {number} wildcard.TRAILING - Append a wildcard to the term, unless a trailing wildcard already exists\n * @see lunr.Query~Clause\n * @see lunr.Query#clause\n * @see lunr.Query#term\n * @example <caption>query term with trailing wildcard</caption>\n * query.term('foo', { wildcard: lunr.Query.wildcard.TRAILING })\n * @example <caption>query term with leading and trailing wildcard</caption>\n * query.term('foo', {\n *   wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING\n * })\n */\n\nlunr.Query.wildcard = new String 
(\"*\")\nlunr.Query.wildcard.NONE = 0\nlunr.Query.wildcard.LEADING = 1\nlunr.Query.wildcard.TRAILING = 2\n\n/**\n * Constants for indicating what kind of presence a term must have in matching documents.\n *\n * @constant\n * @enum {number}\n * @see lunr.Query~Clause\n * @see lunr.Query#clause\n * @see lunr.Query#term\n * @example <caption>query term with required presence</caption>\n * query.term('foo', { presence: lunr.Query.presence.REQUIRED })\n */\nlunr.Query.presence = {\n  /**\n   * Term's presence in a document is optional, this is the default value.\n   */\n  OPTIONAL: 1,\n\n  /**\n   * Term's presence in a document is required, documents that do not contain\n   * this term will not be returned.\n   */\n  REQUIRED: 2,\n\n  /**\n   * Term's presence in a document is prohibited, documents that do contain\n   * this term will not be returned.\n   */\n  PROHIBITED: 3\n}\n\n/**\n * A single clause in a {@link lunr.Query} contains a term and details on how to\n * match that term against a {@link lunr.Index}.\n *\n * @typedef {Object} lunr.Query~Clause\n * @property {string[]} fields - The fields in an index this clause should be matched against.\n * @property {number} [boost=1] - Any boost that should be applied when matching this clause.\n * @property {number} [editDistance] - Whether the term should have fuzzy matching applied, and how fuzzy the match should be.\n * @property {boolean} [usePipeline] - Whether the term should be passed through the search pipeline.\n * @property {number} [wildcard=lunr.Query.wildcard.NONE] - Whether the term should have wildcards appended or prepended.\n * @property {number} [presence=lunr.Query.presence.OPTIONAL] - The terms presence in any matching documents.\n */\n\n/**\n * Adds a {@link lunr.Query~Clause} to this query.\n *\n * Unless the clause contains the fields to be matched all fields will be matched. 
In addition\n * a default boost of 1 is applied to the clause.\n *\n * @param {lunr.Query~Clause} clause - The clause to add to this query.\n * @see lunr.Query~Clause\n * @returns {lunr.Query}\n */\nlunr.Query.prototype.clause = function (clause) {\n  if (!('fields' in clause)) {\n    clause.fields = this.allFields\n  }\n\n  if (!('boost' in clause)) {\n    clause.boost = 1\n  }\n\n  if (!('usePipeline' in clause)) {\n    clause.usePipeline = true\n  }\n\n  if (!('wildcard' in clause)) {\n    clause.wildcard = lunr.Query.wildcard.NONE\n  }\n\n  if ((clause.wildcard & lunr.Query.wildcard.LEADING) && (clause.term.charAt(0) != lunr.Query.wildcard)) {\n    clause.term = \"*\" + clause.term\n  }\n\n  if ((clause.wildcard & lunr.Query.wildcard.TRAILING) && (clause.term.slice(-1) != lunr.Query.wildcard)) {\n    clause.term = \"\" + clause.term + \"*\"\n  }\n\n  if (!('presence' in clause)) {\n    clause.presence = lunr.Query.presence.OPTIONAL\n  }\n\n  this.clauses.push(clause)\n\n  return this\n}\n\n/**\n * A negated query is one in which every clause has a presence of\n * prohibited. These queries require some special processing to return\n * the expected results.\n *\n * @returns boolean\n */\nlunr.Query.prototype.isNegated = function () {\n  for (var i = 0; i < this.clauses.length; i++) {\n    if (this.clauses[i].presence != lunr.Query.presence.PROHIBITED) {\n      return false\n    }\n  }\n\n  return true\n}\n\n/**\n * Adds a term to the current query, under the covers this will create a {@link lunr.Query~Clause}\n * to the list of clauses that make up this query.\n *\n * The term is used as is, i.e. no tokenization will be performed by this method. Instead conversion\n * to a token or token-like string should be done before calling this method.\n *\n * The term will be converted to a string by calling `toString`. 
Multiple terms can be passed as an\n * array, each term in the array will share the same options.\n *\n * @param {object|object[]} term - The term(s) to add to the query.\n * @param {object} [options] - Any additional properties to add to the query clause.\n * @returns {lunr.Query}\n * @see lunr.Query#clause\n * @see lunr.Query~Clause\n * @example <caption>adding a single term to a query</caption>\n * query.term(\"foo\")\n * @example <caption>adding a single term to a query and specifying search fields, term boost and automatic trailing wildcard</caption>\n * query.term(\"foo\", {\n *   fields: [\"title\"],\n *   boost: 10,\n *   wildcard: lunr.Query.wildcard.TRAILING\n * })\n * @example <caption>using lunr.tokenizer to convert a string to tokens before using them as terms</caption>\n * query.term(lunr.tokenizer(\"foo bar\"))\n */\nlunr.Query.prototype.term = function (term, options) {\n  if (Array.isArray(term)) {\n    term.forEach(function (t) { this.term(t, lunr.utils.clone(options)) }, this)\n    return this\n  }\n\n  var clause = options || {}\n  clause.term = term.toString()\n\n  this.clause(clause)\n\n  return this\n}\nlunr.QueryParseError = function (message, start, end) {\n  this.name = \"QueryParseError\"\n  this.message = message\n  this.start = start\n  this.end = end\n}\n\nlunr.QueryParseError.prototype = new Error\nlunr.QueryLexer = function (str) {\n  this.lexemes = []\n  this.str = str\n  this.length = str.length\n  this.pos = 0\n  this.start = 0\n  this.escapeCharPositions = []\n}\n\nlunr.QueryLexer.prototype.run = function () {\n  var state = lunr.QueryLexer.lexText\n\n  while (state) {\n    state = state(this)\n  }\n}\n\nlunr.QueryLexer.prototype.sliceString = function () {\n  var subSlices = [],\n      sliceStart = this.start,\n      sliceEnd = this.pos\n\n  for (var i = 0; i < this.escapeCharPositions.length; i++) {\n    sliceEnd = this.escapeCharPositions[i]\n    subSlices.push(this.str.slice(sliceStart, sliceEnd))\n    sliceStart = sliceEnd + 
1\n  }\n\n  subSlices.push(this.str.slice(sliceStart, this.pos))\n  this.escapeCharPositions.length = 0\n\n  return subSlices.join('')\n}\n\nlunr.QueryLexer.prototype.emit = function (type) {\n  this.lexemes.push({\n    type: type,\n    str: this.sliceString(),\n    start: this.start,\n    end: this.pos\n  })\n\n  this.start = this.pos\n}\n\nlunr.QueryLexer.prototype.escapeCharacter = function () {\n  this.escapeCharPositions.push(this.pos - 1)\n  this.pos += 1\n}\n\nlunr.QueryLexer.prototype.next = function () {\n  if (this.pos >= this.length) {\n    return lunr.QueryLexer.EOS\n  }\n\n  var char = this.str.charAt(this.pos)\n  this.pos += 1\n  return char\n}\n\nlunr.QueryLexer.prototype.width = function () {\n  return this.pos - this.start\n}\n\nlunr.QueryLexer.prototype.ignore = function () {\n  if (this.start == this.pos) {\n    this.pos += 1\n  }\n\n  this.start = this.pos\n}\n\nlunr.QueryLexer.prototype.backup = function () {\n  this.pos -= 1\n}\n\nlunr.QueryLexer.prototype.acceptDigitRun = function () {\n  var char, charCode\n\n  do {\n    char = this.next()\n    charCode = char.charCodeAt(0)\n  } while (charCode > 47 && charCode < 58)\n\n  if (char != lunr.QueryLexer.EOS) {\n    this.backup()\n  }\n}\n\nlunr.QueryLexer.prototype.more = function () {\n  return this.pos < this.length\n}\n\nlunr.QueryLexer.EOS = 'EOS'\nlunr.QueryLexer.FIELD = 'FIELD'\nlunr.QueryLexer.TERM = 'TERM'\nlunr.QueryLexer.EDIT_DISTANCE = 'EDIT_DISTANCE'\nlunr.QueryLexer.BOOST = 'BOOST'\nlunr.QueryLexer.PRESENCE = 'PRESENCE'\n\nlunr.QueryLexer.lexField = function (lexer) {\n  lexer.backup()\n  lexer.emit(lunr.QueryLexer.FIELD)\n  lexer.ignore()\n  return lunr.QueryLexer.lexText\n}\n\nlunr.QueryLexer.lexTerm = function (lexer) {\n  if (lexer.width() > 1) {\n    lexer.backup()\n    lexer.emit(lunr.QueryLexer.TERM)\n  }\n\n  lexer.ignore()\n\n  if (lexer.more()) {\n    return lunr.QueryLexer.lexText\n  }\n}\n\nlunr.QueryLexer.lexEditDistance = function (lexer) {\n  lexer.ignore()\n  
lexer.acceptDigitRun()\n  lexer.emit(lunr.QueryLexer.EDIT_DISTANCE)\n  return lunr.QueryLexer.lexText\n}\n\nlunr.QueryLexer.lexBoost = function (lexer) {\n  lexer.ignore()\n  lexer.acceptDigitRun()\n  lexer.emit(lunr.QueryLexer.BOOST)\n  return lunr.QueryLexer.lexText\n}\n\nlunr.QueryLexer.lexEOS = function (lexer) {\n  if (lexer.width() > 0) {\n    lexer.emit(lunr.QueryLexer.TERM)\n  }\n}\n\n// This matches the separator used when tokenising fields\n// within a document. These should match otherwise it is\n// not possible to search for some tokens within a document.\n//\n// It is possible for the user to change the separator on the\n// tokenizer so it _might_ clash with any other of the special\n// characters already used within the search string, e.g. :.\n//\n// This means that it is possible to change the separator in\n// such a way that makes some words unsearchable using a search\n// string.\nlunr.QueryLexer.termSeparator = lunr.tokenizer.separator\n\nlunr.QueryLexer.lexText = function (lexer) {\n  while (true) {\n    var char = lexer.next()\n\n    if (char == lunr.QueryLexer.EOS) {\n      return lunr.QueryLexer.lexEOS\n    }\n\n    // Escape character is '\\'\n    if (char.charCodeAt(0) == 92) {\n      lexer.escapeCharacter()\n      continue\n    }\n\n    if (char == \":\") {\n      return lunr.QueryLexer.lexField\n    }\n\n    if (char == \"~\") {\n      lexer.backup()\n      if (lexer.width() > 0) {\n        lexer.emit(lunr.QueryLexer.TERM)\n      }\n      return lunr.QueryLexer.lexEditDistance\n    }\n\n    if (char == \"^\") {\n      lexer.backup()\n      if (lexer.width() > 0) {\n        lexer.emit(lunr.QueryLexer.TERM)\n      }\n      return lunr.QueryLexer.lexBoost\n    }\n\n    // \"+\" indicates term presence is required\n    // checking for length to ensure that only\n    // leading \"+\" are considered\n    if (char == \"+\" && lexer.width() === 1) {\n      lexer.emit(lunr.QueryLexer.PRESENCE)\n      return lunr.QueryLexer.lexText\n    }\n\n    // 
\"-\" indicates term presence is prohibited\n    // checking for length to ensure that only\n    // leading \"-\" are considered\n    if (char == \"-\" && lexer.width() === 1) {\n      lexer.emit(lunr.QueryLexer.PRESENCE)\n      return lunr.QueryLexer.lexText\n    }\n\n    if (char.match(lunr.QueryLexer.termSeparator)) {\n      return lunr.QueryLexer.lexTerm\n    }\n  }\n}\n\nlunr.QueryParser = function (str, query) {\n  this.lexer = new lunr.QueryLexer (str)\n  this.query = query\n  this.currentClause = {}\n  this.lexemeIdx = 0\n}\n\nlunr.QueryParser.prototype.parse = function () {\n  this.lexer.run()\n  this.lexemes = this.lexer.lexemes\n\n  var state = lunr.QueryParser.parseClause\n\n  while (state) {\n    state = state(this)\n  }\n\n  return this.query\n}\n\nlunr.QueryParser.prototype.peekLexeme = function () {\n  return this.lexemes[this.lexemeIdx]\n}\n\nlunr.QueryParser.prototype.consumeLexeme = function () {\n  var lexeme = this.peekLexeme()\n  this.lexemeIdx += 1\n  return lexeme\n}\n\nlunr.QueryParser.prototype.nextClause = function () {\n  var completedClause = this.currentClause\n  this.query.clause(completedClause)\n  this.currentClause = {}\n}\n\nlunr.QueryParser.parseClause = function (parser) {\n  var lexeme = parser.peekLexeme()\n\n  if (lexeme == undefined) {\n    return\n  }\n\n  switch (lexeme.type) {\n    case lunr.QueryLexer.PRESENCE:\n      return lunr.QueryParser.parsePresence\n    case lunr.QueryLexer.FIELD:\n      return lunr.QueryParser.parseField\n    case lunr.QueryLexer.TERM:\n      return lunr.QueryParser.parseTerm\n    default:\n      var errorMessage = \"expected either a field or a term, found \" + lexeme.type\n\n      if (lexeme.str.length >= 1) {\n        errorMessage += \" with value '\" + lexeme.str + \"'\"\n      }\n\n      throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n}\n\nlunr.QueryParser.parsePresence = function (parser) {\n  var lexeme = parser.consumeLexeme()\n\n  if (lexeme == undefined) 
{\n    return\n  }\n\n  switch (lexeme.str) {\n    case \"-\":\n      parser.currentClause.presence = lunr.Query.presence.PROHIBITED\n      break\n    case \"+\":\n      parser.currentClause.presence = lunr.Query.presence.REQUIRED\n      break\n    default:\n      var errorMessage = \"unrecognised presence operator'\" + lexeme.str + \"'\"\n      throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n\n  var nextLexeme = parser.peekLexeme()\n\n  if (nextLexeme == undefined) {\n    var errorMessage = \"expecting term or field, found nothing\"\n    throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n\n  switch (nextLexeme.type) {\n    case lunr.QueryLexer.FIELD:\n      return lunr.QueryParser.parseField\n    case lunr.QueryLexer.TERM:\n      return lunr.QueryParser.parseTerm\n    default:\n      var errorMessage = \"expecting term or field, found '\" + nextLexeme.type + \"'\"\n      throw new lunr.QueryParseError (errorMessage, nextLexeme.start, nextLexeme.end)\n  }\n}\n\nlunr.QueryParser.parseField = function (parser) {\n  var lexeme = parser.consumeLexeme()\n\n  if (lexeme == undefined) {\n    return\n  }\n\n  if (parser.query.allFields.indexOf(lexeme.str) == -1) {\n    var possibleFields = parser.query.allFields.map(function (f) { return \"'\" + f + \"'\" }).join(', '),\n        errorMessage = \"unrecognised field '\" + lexeme.str + \"', possible fields: \" + possibleFields\n\n    throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n\n  parser.currentClause.fields = [lexeme.str]\n\n  var nextLexeme = parser.peekLexeme()\n\n  if (nextLexeme == undefined) {\n    var errorMessage = \"expecting term, found nothing\"\n    throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n\n  switch (nextLexeme.type) {\n    case lunr.QueryLexer.TERM:\n      return lunr.QueryParser.parseTerm\n    default:\n      var errorMessage = \"expecting term, found '\" + nextLexeme.type + \"'\"\n    
  throw new lunr.QueryParseError (errorMessage, nextLexeme.start, nextLexeme.end)\n  }\n}\n\nlunr.QueryParser.parseTerm = function (parser) {\n  var lexeme = parser.consumeLexeme()\n\n  if (lexeme == undefined) {\n    return\n  }\n\n  parser.currentClause.term = lexeme.str.toLowerCase()\n\n  if (lexeme.str.indexOf(\"*\") != -1) {\n    parser.currentClause.usePipeline = false\n  }\n\n  var nextLexeme = parser.peekLexeme()\n\n  if (nextLexeme == undefined) {\n    parser.nextClause()\n    return\n  }\n\n  switch (nextLexeme.type) {\n    case lunr.QueryLexer.TERM:\n      parser.nextClause()\n      return lunr.QueryParser.parseTerm\n    case lunr.QueryLexer.FIELD:\n      parser.nextClause()\n      return lunr.QueryParser.parseField\n    case lunr.QueryLexer.EDIT_DISTANCE:\n      return lunr.QueryParser.parseEditDistance\n    case lunr.QueryLexer.BOOST:\n      return lunr.QueryParser.parseBoost\n    case lunr.QueryLexer.PRESENCE:\n      parser.nextClause()\n      return lunr.QueryParser.parsePresence\n    default:\n      var errorMessage = \"Unexpected lexeme type '\" + nextLexeme.type + \"'\"\n      throw new lunr.QueryParseError (errorMessage, nextLexeme.start, nextLexeme.end)\n  }\n}\n\nlunr.QueryParser.parseEditDistance = function (parser) {\n  var lexeme = parser.consumeLexeme()\n\n  if (lexeme == undefined) {\n    return\n  }\n\n  var editDistance = parseInt(lexeme.str, 10)\n\n  if (isNaN(editDistance)) {\n    var errorMessage = \"edit distance must be numeric\"\n    throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n\n  parser.currentClause.editDistance = editDistance\n\n  var nextLexeme = parser.peekLexeme()\n\n  if (nextLexeme == undefined) {\n    parser.nextClause()\n    return\n  }\n\n  switch (nextLexeme.type) {\n    case lunr.QueryLexer.TERM:\n      parser.nextClause()\n      return lunr.QueryParser.parseTerm\n    case lunr.QueryLexer.FIELD:\n      parser.nextClause()\n      return lunr.QueryParser.parseField\n    case 
lunr.QueryLexer.EDIT_DISTANCE:\n      return lunr.QueryParser.parseEditDistance\n    case lunr.QueryLexer.BOOST:\n      return lunr.QueryParser.parseBoost\n    case lunr.QueryLexer.PRESENCE:\n      parser.nextClause()\n      return lunr.QueryParser.parsePresence\n    default:\n      var errorMessage = \"Unexpected lexeme type '\" + nextLexeme.type + \"'\"\n      throw new lunr.QueryParseError (errorMessage, nextLexeme.start, nextLexeme.end)\n  }\n}\n\nlunr.QueryParser.parseBoost = function (parser) {\n  var lexeme = parser.consumeLexeme()\n\n  if (lexeme == undefined) {\n    return\n  }\n\n  var boost = parseInt(lexeme.str, 10)\n\n  if (isNaN(boost)) {\n    var errorMessage = \"boost must be numeric\"\n    throw new lunr.QueryParseError (errorMessage, lexeme.start, lexeme.end)\n  }\n\n  parser.currentClause.boost = boost\n\n  var nextLexeme = parser.peekLexeme()\n\n  if (nextLexeme == undefined) {\n    parser.nextClause()\n    return\n  }\n\n  switch (nextLexeme.type) {\n    case lunr.QueryLexer.TERM:\n      parser.nextClause()\n      return lunr.QueryParser.parseTerm\n    case lunr.QueryLexer.FIELD:\n      parser.nextClause()\n      return lunr.QueryParser.parseField\n    case lunr.QueryLexer.EDIT_DISTANCE:\n      return lunr.QueryParser.parseEditDistance\n    case lunr.QueryLexer.BOOST:\n      return lunr.QueryParser.parseBoost\n    case lunr.QueryLexer.PRESENCE:\n      parser.nextClause()\n      return lunr.QueryParser.parsePresence\n    default:\n      var errorMessage = \"Unexpected lexeme type '\" + nextLexeme.type + \"'\"\n      throw new lunr.QueryParseError (errorMessage, nextLexeme.start, nextLexeme.end)\n  }\n}\n\n  /**\n   * export the module via AMD, CommonJS or as a browser global\n   * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js\n   */\n  ;(function (root, factory) {\n    if (typeof define === 'function' && define.amd) {\n      // AMD. 
Register as an anonymous module.\n      define(factory)\n    } else if (typeof exports === 'object') {\n      /**\n       * Node. Does not work with strict CommonJS, but\n       * only CommonJS-like enviroments that support module.exports,\n       * like Node.\n       */\n      module.exports = factory()\n    } else {\n      // Browser globals (root is window)\n      root.lunr = factory()\n    }\n  }(this, function () {\n    /**\n     * Just return a value to define the module export.\n     * This example returns an object, but the module\n     * can return a function as the exported value.\n     */\n    return lunr\n  }))\n})();\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A RTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport lunr from \"lunr\"\n\nimport { getElement } from \"~/browser/element/_\"\nimport \"~/polyfills\"\n\nimport { Search } from \"../../_\"\nimport { SearchConfig } from \"../../config\"\nimport {\n  SearchMessage,\n  SearchMessageType\n} from \"../message\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Add support for `iframe-worker` shim\n *\n * While `importScripts` is synchronous when executed inside of a web worker,\n * it's not possible to provide a synchronous shim implementation. The cool\n * thing is that awaiting a non-Promise will convert it into a Promise, so\n * extending the type definition to return a `Promise` shouldn't break anything.\n *\n * @see https://bit.ly/2PjDnXi - GitHub comment\n *\n * @param urls - Scripts to load\n *\n * @returns Promise resolving with no result\n */\ndeclare global {\n  function importScripts(...urls: string[]): Promise<void> | void\n}\n\n/* ----------------------------------------------------------------------------\n * Data\n * ------------------------------------------------------------------------- */\n\n/**\n * Search index\n */\nlet index: Search\n\n/* ----------------------------------------------------------------------------\n * Helper functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch (= import) multi-language support through `lunr-languages`\n *\n * This function automatically imports the stemmers necessary to process the\n * languages which are defined as part of the search configuration.\n *\n * If the worker runs inside of an `iframe` 
(when using `iframe-worker` as\n * a shim), the base URL for the stemmers to be loaded must be determined by\n * searching for the first `script` element with a `src` attribute, which will\n * contain the contents of this script.\n *\n * @param config - Search configuration\n *\n * @returns Promise resolving with no result\n */\nasync function setupSearchLanguages(\n  config: SearchConfig\n): Promise<void> {\n  let base = \"../lunr\"\n\n  /* Detect `iframe-worker` and fix base URL */\n  if (typeof parent !== \"undefined\" && \"IFrameWorker\" in parent) {\n    const worker = getElement<HTMLScriptElement>(\"script[src]\")!\n    const [path] = worker.src.split(\"/worker\")\n\n    /* Prefix base with path */\n    base = base.replace(\"..\", path)\n  }\n\n  /* Add scripts for languages */\n  const scripts = []\n  for (const lang of config.lang) {\n    switch (lang) {\n\n      /* Add segmenter for Japanese */\n      case \"ja\":\n        scripts.push(`${base}/tinyseg.js`)\n        break\n\n      /* Add segmenter for Hindi and Thai */\n      case \"hi\":\n      case \"th\":\n        scripts.push(`${base}/wordcut.js`)\n        break\n    }\n\n    /* Add language support */\n    if (lang !== \"en\")\n      scripts.push(`${base}/min/lunr.${lang}.min.js`)\n  }\n\n  /* Add multi-language support */\n  if (config.lang.length > 1)\n    scripts.push(`${base}/min/lunr.multi.min.js`)\n\n  /* Load scripts synchronously */\n  if (scripts.length)\n    await importScripts(\n      `${base}/min/lunr.stemmer.support.min.js`,\n      ...scripts\n    )\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Message handler\n *\n * @param message - Source message\n *\n * @returns Target message\n */\nexport async function handler(\n  message: SearchMessage\n): Promise<SearchMessage> {\n  switch (message.type) {\n\n    /* Search setup message */\n    case 
SearchMessageType.SETUP:\n      await setupSearchLanguages(message.data.config)\n      index = new Search(message.data)\n      return {\n        type: SearchMessageType.READY\n      }\n\n    /* Search query message */\n    case SearchMessageType.QUERY:\n      const query = message.data\n      try {\n        return {\n          type: SearchMessageType.RESULT,\n          data: index.search(query)\n        }\n\n      /* Return empty result in case of error */\n      } catch (err) {\n        console.warn(`Invalid query: ${query} \u2013 see https://bit.ly/2s3ChXG`)\n        console.warn(err)\n        return {\n          type: SearchMessageType.RESULT,\n          data: { items: [] }\n        }\n      }\n\n    /* All other messages */\n    default:\n      throw new TypeError(\"Invalid message type\")\n  }\n}\n\n/* ----------------------------------------------------------------------------\n * Worker\n * ------------------------------------------------------------------------- */\n\n/* Expose Lunr.js in global scope, or stemmers won't work */\nself.lunr = lunr\n\n/* Handle messages */\naddEventListener(\"message\", async ev => {\n  postMessage(await handler(ev.data))\n})\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Retrieve all elements matching the query selector\n *\n * @template T - Element type\n *\n * @param selector - Query selector\n * @param node - Node of reference\n *\n * @returns Elements\n */\nexport function getElements<T extends keyof HTMLElementTagNameMap>(\n  selector: T, node?: ParentNode\n): HTMLElementTagNameMap[T][]\n\nexport function getElements<T extends HTMLElement>(\n  selector: string, node?: ParentNode\n): T[]\n\nexport function getElements<T extends HTMLElement>(\n  selector: string, node: ParentNode = document\n): T[] {\n  return Array.from(node.querySelectorAll<T>(selector))\n}\n\n/**\n * Retrieve an element matching a query selector or throw a reference error\n *\n * Note that this function assumes that the element is present. 
If unsure if an\n * element is existent, use the `getOptionalElement` function instead.\n *\n * @template T - Element type\n *\n * @param selector - Query selector\n * @param node - Node of reference\n *\n * @returns Element\n */\nexport function getElement<T extends keyof HTMLElementTagNameMap>(\n  selector: T, node?: ParentNode\n): HTMLElementTagNameMap[T]\n\nexport function getElement<T extends HTMLElement>(\n  selector: string, node?: ParentNode\n): T\n\nexport function getElement<T extends HTMLElement>(\n  selector: string, node: ParentNode = document\n): T {\n  const el = getOptionalElement<T>(selector, node)\n  if (typeof el === \"undefined\")\n    throw new ReferenceError(\n      `Missing element: expected \"${selector}\" to be present`\n    )\n\n  /* Return element */\n  return el\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Retrieve an optional element matching the query selector\n *\n * @template T - Element type\n *\n * @param selector - Query selector\n * @param node - Node of reference\n *\n * @returns Element or nothing\n */\nexport function getOptionalElement<T extends keyof HTMLElementTagNameMap>(\n  selector: T, node?: ParentNode\n): HTMLElementTagNameMap[T] | undefined\n\nexport function getOptionalElement<T extends HTMLElement>(\n  selector: string, node?: ParentNode\n): T | undefined\n\nexport function getOptionalElement<T extends HTMLElement>(\n  selector: string, node: ParentNode = document\n): T | undefined {\n  return node.querySelector<T>(selector) || undefined\n}\n\n/**\n * Retrieve the currently active element\n *\n * @returns Element or nothing\n */\nexport function getActiveElement(): HTMLElement | undefined {\n  return (\n    document.activeElement?.shadowRoot?.activeElement as HTMLElement ??\n    document.activeElement as HTMLElement ??\n    undefined\n  )\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of 
charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Polyfills\n * ------------------------------------------------------------------------- */\n\n/* Polyfill `Object.entries` */\nif (!Object.entries)\n  Object.entries = function (obj: object) {\n    const data: [string, string][] = []\n    for (const key of Object.keys(obj))\n      // @ts-expect-error - ignore property access warning\n      data.push([key, obj[key]])\n\n    /* Return entries */\n    return data\n  }\n\n/* Polyfill `Object.values` */\nif (!Object.values)\n  Object.values = function (obj: object) {\n    const data: string[] = []\n    for (const key of Object.keys(obj))\n      // @ts-expect-error - ignore property access warning\n      data.push(obj[key])\n\n    /* Return values */\n    return data\n  }\n\n/* ------------------------------------------------------------------------- */\n\n/* Polyfills for `Element` */\nif (typeof 
Element !== \"undefined\") {\n\n  /* Polyfill `Element.scrollTo` */\n  if (!Element.prototype.scrollTo)\n    Element.prototype.scrollTo = function (\n      x?: ScrollToOptions | number, y?: number\n    ): void {\n      if (typeof x === \"object\") {\n        this.scrollLeft = x.left!\n        this.scrollTop = x.top!\n      } else {\n        this.scrollLeft = x!\n        this.scrollTop = y!\n      }\n    }\n\n  /* Polyfill `Element.replaceWith` */\n  if (!Element.prototype.replaceWith)\n    Element.prototype.replaceWith = function (\n      ...nodes: Array<string | Node>\n    ): void {\n      const parent = this.parentNode\n      if (parent) {\n        if (nodes.length === 0)\n          parent.removeChild(this)\n\n        /* Replace children and create text nodes */\n        for (let i = nodes.length - 1; i >= 0; i--) {\n          let node = nodes[i]\n          if (typeof node === \"string\")\n            node = document.createTextNode(node)\n          else if (node.parentNode)\n            node.parentNode.removeChild(node)\n\n          /* Replace child or insert before previous sibling */\n          if (!i)\n            parent.replaceChild(node, this)\n          else\n            parent.insertBefore(this.previousSibling!, node)\n        }\n      }\n    }\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS 
IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search configuration\n */\nexport interface SearchConfig {\n  lang: string[]                       /* Search languages */\n  separator: string                    /* Search separator */\n  pipeline: SearchPipelineFn[]         /* Search pipeline */\n}\n\n/**\n * Search document\n */\nexport interface SearchDocument {\n  location: string                     /* Document location */\n  title: string                        /* Document title */\n  text: string                         /* Document text */\n  tags?: string[]                      /* Document tags */\n  boost?: number                       /* Document boost */\n  parent?: SearchDocument              /* Document parent */\n}\n\n/**\n * Search options\n */\nexport interface SearchOptions {\n  suggest: boolean                     /* Search suggestions */\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Search index\n */\nexport interface SearchIndex {\n  config: SearchConfig                 /* Search configuration */\n  docs: SearchDocument[]               /* Search documents */\n  options: SearchOptions               /* Search options */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * 
Search pipeline function\n */\ntype SearchPipelineFn =\n  | \"trimmer\"                          /* Trimmer */\n  | \"stopWordFilter\"                   /* Stop word filter */\n  | \"stemmer\"                          /* Stemmer */\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Create a search document map\n *\n * This function creates a mapping of URLs (including anchors) to the actual\n * articles and sections. It relies on the invariant that the search index is\n * ordered with the main article appearing before all sections with anchors.\n * If this is not the case, the logic music be changed.\n *\n * @param docs - Search documents\n *\n * @returns Search document map\n */\nexport function setupSearchDocumentMap(\n  docs: SearchDocument[]\n): Map<string, SearchDocument> {\n  const map = new Map<string, SearchDocument>()\n  for (const doc of docs) {\n    const [path] = doc.location.split(\"#\")\n\n    /* Add document article */\n    const article = map.get(path)\n    if (typeof article === \"undefined\") {\n      map.set(path, doc)\n\n      /* Add document section */\n    } else {\n      map.set(doc.location, doc)\n      doc.parent = article\n    }\n  }\n\n  /* Return search document map */\n  return map\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n 
* all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Visitor function\n *\n * @param start - Start offset\n * @param end - End offset\n */\ntype VisitorFn = (\n  start: number, end: number\n) => void\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Split a string using the given separator\n *\n * @param input - Input value\n * @param separator - Separator\n * @param fn - Visitor function\n */\nexport function split(\n  input: string, separator: RegExp, fn: VisitorFn\n): void {\n  separator = new RegExp(separator, \"g\")\n\n  /* Split string using separator */\n  let match: RegExpExecArray | null\n  let index = 0\n  do {\n    match = separator.exec(input)\n\n    /* Emit non-empty range */\n    const until = match?.index ?? 
input.length\n    if (index < until)\n      fn(index, until)\n\n    /* Update last index */\n    if (match) {\n      const [term] = match\n      index = match.index + term.length\n\n      /* Support zero-length lookaheads */\n      if (term.length === 0)\n        separator.lastIndex = match.index + 1\n    }\n  } while (match)\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Extraction type\n *\n * This type defines the possible values that are encoded into the first two\n * bits of a section that is part of the blocks of a tokenization table. 
There\n * are three types of interest: HTML opening and closing tags, as well as the\n * actual text content we need to extract for indexing.\n */\nexport const enum Extract {\n  TAG_OPEN  = 0,                       /* HTML opening tag */\n  TEXT      = 1,                       /* Text content */\n  TAG_CLOSE = 2                        /* HTML closing tag */\n}\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Visitor function\n *\n * @param block - Block index\n * @param type - Extraction type\n * @param start - Start offset\n * @param end - End offset\n */\ntype VisitorFn = (\n  block: number, type: Extract, start: number, end: number\n) => void\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Split a string into markup and text sections\n *\n * This function scans a string and divides it up into sections of markup and\n * text. For each section, it invokes the given visitor function with the block\n * index, extraction type, as well as start and end offsets. 
Using a visitor\n * function (= streaming data) is ideal for minimizing pressure on the GC.\n *\n * @param input - Input value\n * @param fn - Visitor function\n */\nexport function extract(\n  input: string, fn: VisitorFn\n): void {\n\n  let block = 0                        /* Current block */\n  let start = 0                        /* Current start offset */\n  let end = 0                          /* Current end offset */\n\n  /* Split string into sections */\n  for (let stack = 0; end < input.length; end++) {\n\n    /* Opening tag after non-empty section */\n    if (input.charAt(end) === \"<\" && end > start) {\n      fn(block, Extract.TEXT, start, start = end)\n\n    /* Closing tag */\n    } else if (input.charAt(end) === \">\") {\n      if (input.charAt(start + 1) === \"/\") {\n        if (--stack === 0)\n          fn(block++, Extract.TAG_CLOSE, start, end + 1)\n\n      /* Tag is not self-closing */\n      } else if (input.charAt(end - 1) !== \"/\") {\n        if (stack++ === 0)\n          fn(block, Extract.TAG_OPEN, start, end + 1)\n      }\n\n      /* New section */\n      start = end + 1\n    }\n  }\n\n  /* Add trailing section */\n  if (end > start)\n    fn(block, Extract.TEXT, start, end)\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * 
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Position table\n */\nexport type PositionTable = number[][]\n\n/**\n * Position\n */\nexport type Position = number\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Highlight all occurrences in a string\n *\n * This function receives a field's value (e.g. like `title` or `text`), it's\n * position table that was generated during indexing, and the positions found\n * when executing the query. It then highlights all occurrences, and returns\n * their concatenation. 
In case of multiple blocks, two are returned.\n *\n * @param input - Input value\n * @param table - Table for indexing\n * @param positions - Occurrences\n * @param full - Full results\n *\n * @returns Highlighted string value\n */\nexport function highlight(\n  input: string, table: PositionTable, positions: Position[], full = false\n): string {\n  return highlightAll([input], table, positions, full).pop()!\n}\n\n/**\n * Highlight all occurrences in a set of strings\n *\n * @param inputs - Input values\n * @param table - Table for indexing\n * @param positions - Occurrences\n * @param full - Full results\n *\n * @returns Highlighted string values\n */\nexport function highlightAll(\n  inputs: string[], table: PositionTable, positions: Position[], full = false\n): string[] {\n\n  /* Map blocks to input values */\n  const mapping = [0]\n  for (let t = 1; t < table.length; t++) {\n    const prev = table[t - 1]\n    const next = table[t]\n\n    /* Check if table points to new block */\n    const p = prev[prev.length - 1] >>> 2 & 0x3FF\n    const q = next[0]               >>> 12\n\n    /* Add block to mapping */\n    mapping.push(+(p > q) + mapping[mapping.length - 1])\n  }\n\n  /* Highlight strings one after another */\n  return inputs.map((input, i) => {\n    let cursor = 0\n\n    /* Map occurrences to blocks */\n    const blocks = new Map<number, number[]>()\n    for (const p of positions.sort((a, b) => a - b)) {\n      const index = p & 0xFFFFF\n      const block = p >>> 20\n      if (mapping[block] !== i)\n        continue\n\n      /* Ensure presence of block group */\n      let group = blocks.get(block)\n      if (typeof group === \"undefined\")\n        blocks.set(block, group = [])\n\n      /* Add index to group */\n      group.push(index)\n    }\n\n    /* Just return string, if no occurrences */\n    if (blocks.size === 0)\n      return input\n\n    /* Compute slices */\n    const slices: string[] = []\n    for (const [block, indexes] of blocks) {\n      const 
t = table[block]\n\n      /* Extract positions and length */\n      const start  = t[0]            >>> 12\n      const end    = t[t.length - 1] >>> 12\n      const length = t[t.length - 1] >>> 2 & 0x3FF\n\n      /* Add prefix, if full results are desired */\n      if (full && start > cursor)\n        slices.push(input.slice(cursor, start))\n\n      /* Extract and highlight slice */\n      let slice = input.slice(start, end + length)\n      for (const j of indexes.sort((a, b) => b - a)) {\n\n        /* Retrieve offset and length of match */\n        const p = (t[j] >>> 12) - start\n        const q = (t[j] >>> 2 & 0x3FF) + p\n\n        /* Wrap occurrence */\n        slice = [\n          slice.slice(0, p),\n          \"<mark>\",\n          slice.slice(p, q),\n          \"</mark>\",\n          slice.slice(q)\n        ].join(\"\")\n      }\n\n      /* Update cursor */\n      cursor = end + length\n\n      /* Append slice and abort if we have two */\n      if (slices.push(slice) === 2)\n        break\n    }\n\n    /* Add suffix, if full results are desired */\n    if (full && cursor < input.length)\n      slices.push(input.slice(cursor))\n\n    /* Return highlighted slices */\n    return slices.join(\"\")\n  })\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * 
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { split } from \"../_\"\nimport {\n  Extract,\n  extract\n} from \"../extract\"\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Split a string or set of strings into tokens\n *\n * This tokenizer supersedes the default tokenizer that is provided by Lunr.js,\n * as it is aware of HTML tags and allows for multi-character splitting.\n *\n * It takes the given inputs, splits each of them into markup and text sections,\n * tokenizes and segments (if necessary) each of them, and then indexes them in\n * a table by using a compact bit representation. Bitwise techniques are used\n * to write and read from the table during indexing and querying.\n *\n * @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller\n *\n * @param input - Input value(s)\n *\n * @returns Tokens\n */\nexport function tokenize(\n  input?: string | string[]\n): lunr.Token[] {\n  const tokens: lunr.Token[] = []\n  if (typeof input === \"undefined\")\n    return tokens\n\n  /* Tokenize strings one after another */\n  const inputs = Array.isArray(input) ? 
input : [input]\n  for (let i = 0; i < inputs.length; i++) {\n    const table = lunr.tokenizer.table\n    const total = table.length\n\n    /* Split string into sections and tokenize content blocks */\n    extract(inputs[i], (block, type, start, end) => {\n      table[block += total] ||= []\n      switch (type) {\n\n        /* Handle markup */\n        case Extract.TAG_OPEN:\n        case Extract.TAG_CLOSE:\n          table[block].push(\n            start       << 12 |\n            end - start <<  2 |\n            type\n          )\n          break\n\n        /* Handle text content */\n        case Extract.TEXT:\n          const section = inputs[i].slice(start, end)\n          split(section, lunr.tokenizer.separator, (index, until) => {\n\n            /**\n             * Apply segmenter after tokenization. Note that the segmenter will\n             * also split words at word boundaries, which is not what we want,\n             * so we need to check if we can somehow mitigate this behavior.\n             */\n            if (typeof lunr.segmenter !== \"undefined\") {\n              const subsection = section.slice(index, until)\n              if (/^[MHIK]$/.test(lunr.segmenter.ctype_(subsection))) {\n                const segments = lunr.segmenter.segment(subsection)\n                for (let s = 0, l = 0; s < segments.length; s++) {\n\n                  /* Add block to section */\n                  table[block] ||= []\n                  table[block].push(\n                    start + index + l  << 12 |\n                    segments[s].length <<  2 |\n                    type\n                  )\n\n                  /* Add token with position */\n                  tokens.push(new lunr.Token(\n                    segments[s].toLowerCase(), {\n                      position: block << 20 | table[block].length - 1\n                    }\n                  ))\n\n                  /* Keep track of length */\n                  l += segments[s].length\n                }\n   
             return\n              }\n            }\n\n            /* Add block to section */\n            table[block].push(\n              start + index << 12 |\n              until - index <<  2 |\n              type\n            )\n\n            /* Add token with position */\n            tokens.push(new lunr.Token(\n              section.slice(index, until).toLowerCase(), {\n                position: block << 20 | table[block].length - 1\n              }\n            ))\n          })\n      }\n    })\n  }\n\n  /* Return tokens */\n  return tokens\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Helper types\n * ------------------------------------------------------------------------- */\n\n/**\n * Visitor function\n *\n * @param value - String value\n *\n * @returns String term(s)\n */\ntype VisitorFn = (\n  value: string\n) => string | string[]\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Default transformation function\n *\n * 1. Trim excess whitespace from left and right.\n *\n * 2. Search for parts in quotation marks and prepend a `+` modifier to denote\n *    that the resulting document must contain all parts, converting the query\n *    to an `AND` query (as opposed to the default `OR` behavior). While users\n *    may expect parts enclosed in quotation marks to map to span queries, i.e.\n *    for which order is important, Lunr.js doesn't support them, so the best\n *    we can do is to convert the parts to an `AND` query.\n *\n * 3. Replace control characters which are not located at the beginning of the\n *    query or preceded by white space, or are not followed by a non-whitespace\n *    character or are at the end of the query string. Furthermore, filter\n *    unmatched quotation marks.\n *\n * 4. 
Split the query string at whitespace, then pass each part to the visitor\n *    function for tokenization, and append a wildcard to every resulting term\n *    that is not explicitly marked with a `+`, `-`, `~` or `^` modifier, since\n *    it ensures consistent and stable ranking when multiple terms are entered.\n *    Also, if a fuzzy or boost modifier are given, but no numeric value has\n *    been entered, default to 1 to not induce a query error.\n *\n * @param query - Query value\n * @param fn - Visitor function\n *\n * @returns Transformed query value\n */\nexport function transform(\n  query: string, fn: VisitorFn = term => term\n): string {\n  return query\n\n    /* => 1 */\n    .trim()\n\n    /* => 2 */\n    .split(/\"([^\"]+)\"/g)\n      .map((parts, index) => index & 1\n        ? parts.replace(/^\\b|^(?![^\\x00-\\x7F]|$)|\\s+/g, \" +\")\n        : parts\n      )\n      .join(\"\")\n\n    /* => 3 */\n    .replace(/\"|(?:^|\\s+)[*+\\-:^~]+(?=\\s+|$)/g, \"\")\n\n    /* => 4 */\n    .split(/\\s+/g)\n      .reduce((prev, term) => {\n        const next = fn(term)\n        return [...prev, ...Array.isArray(next) ? next : [next]]\n      }, [] as string[])\n      .map(term => /([~^]$)/.test(term) ? `${term}1` : term)\n      .map(term => /(^[+-]|[~^]\\d+$)/.test(term) ? 
term : `${term}*`)\n      .join(\" \")\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport { split } from \"../../internal\"\nimport { transform } from \"../transform\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search query clause\n */\nexport interface SearchQueryClause {\n  presence: lunr.Query.presence        /* Clause presence */\n  term: string                         /* Clause term */\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Search query terms\n */\nexport type SearchQueryTerms = Record<string, boolean>\n\n/* ----------------------------------------------------------------------------\n * Functions\n * 
------------------------------------------------------------------------- */\n\n/**\n * Transform search query\n *\n * This function lexes the given search query and applies the transformation\n * function to each term, preserving markup like `+` and `-` modifiers.\n *\n * @param query - Search query\n *\n * @returns Search query\n */\nexport function transformSearchQuery(\n  query: string\n): string {\n\n  /* Split query terms with tokenizer */\n  return transform(query, part => {\n    const terms: string[] = []\n\n    /* Initialize lexer and analyze part */\n    const lexer = new lunr.QueryLexer(part)\n    lexer.run()\n\n    /* Extract and tokenize term from lexeme */\n    for (const { type, str: term, start, end } of lexer.lexemes)\n      switch (type) {\n\n        /* Hack: remove colon - see https://bit.ly/3wD3T3I */\n        case \"FIELD\":\n          if (![\"title\", \"text\", \"tags\"].includes(term))\n            part = [\n              part.slice(0, end),\n              \" \",\n              part.slice(end + 1)\n            ].join(\"\")\n          break\n\n        /* Tokenize term */\n        case \"TERM\":\n          split(term, lunr.tokenizer.separator, (...range) => {\n            terms.push([\n              part.slice(0, start),\n              term.slice(...range),\n              part.slice(end)\n            ].join(\"\"))\n          })\n      }\n\n    /* Return terms */\n    return terms\n  })\n}\n\n/* ------------------------------------------------------------------------- */\n\n/**\n * Parse a search query for analysis\n *\n * Lunr.js itself has a bug where it doesn't detect or remove wildcards for\n * query clauses, so we must do this here.\n *\n * @see https://bit.ly/3DpTGtz - GitHub issue\n *\n * @param value - Query value\n *\n * @returns Search query clauses\n */\nexport function parseSearchQuery(\n  value: string\n): SearchQueryClause[] {\n  const query  = new lunr.Query([\"title\", \"text\", \"tags\"])\n  const parser = new 
lunr.QueryParser(value, query)\n\n  /* Parse Search query */\n  parser.parse()\n  for (const clause of query.clauses) {\n    clause.usePipeline = true\n\n    /* Handle leading wildcard */\n    if (clause.term.startsWith(\"*\")) {\n      clause.wildcard = lunr.Query.wildcard.LEADING\n      clause.term = clause.term.slice(1)\n    }\n\n    /* Handle trailing wildcard */\n    if (clause.term.endsWith(\"*\")) {\n      clause.wildcard = lunr.Query.wildcard.TRAILING\n      clause.term = clause.term.slice(0, -1)\n    }\n  }\n\n  /* Return query clauses */\n  return query.clauses\n}\n\n/**\n * Analyze the search query clauses in regard to the search terms found\n *\n * @param query - Search query clauses\n * @param terms - Search terms\n *\n * @returns Search query terms\n */\nexport function getSearchQueryTerms(\n  query: SearchQueryClause[], terms: string[]\n): SearchQueryTerms {\n  const clauses = new Set<SearchQueryClause>(query)\n\n  /* Match query clauses against terms */\n  const result: SearchQueryTerms = {}\n  for (let t = 0; t < terms.length; t++)\n    for (const clause of clauses)\n      if (terms[t].startsWith(clause.term)) {\n        result[clause.term] = true\n        clauses.delete(clause)\n      }\n\n  /* Annotate unmatched non-stopword query clauses */\n  for (const clause of clauses)\n    if (lunr.stopWordFilter?.(clause.term))\n      result[clause.term] = false\n\n  /* Return query terms */\n  return result\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The 
above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Segment a search query using the inverted index\n *\n * This function implements a clever approach to text segmentation for Asian\n * languages, as it used the information already available in the search index.\n * The idea is to greedily segment the search query based on the tokens that are\n * already part of the index, as described in the linked issue.\n *\n * @see https://bit.ly/3lwjrk7 - GitHub issue\n *\n * @param query - Query value\n * @param index - Inverted index\n *\n * @returns Segmented query value\n */\nexport function segment(\n  query: string, index: object\n): Iterable<string> {\n  const segments = new Set<string>()\n\n  /* Segment search query */\n  const wordcuts = new Uint16Array(query.length)\n  for (let i = 0; i < query.length; i++)\n    for (let j = i + 1; j < query.length; j++) {\n      const value = query.slice(i, j)\n      if (value in index)\n        wordcuts[i] = j - i\n    }\n\n  /* Compute longest matches with minimum overlap */\n  const stack = [0]\n  for (let s = stack.length; s > 0;) {\n    const p = stack[--s]\n    for (let q = 1; q < wordcuts[p]; q++)\n      if (wordcuts[p + q] > wordcuts[p] - q) {\n        
segments.add(query.slice(p, p + q))\n        stack[s++] = p + q\n      }\n\n    /* Continue at end of query string */\n    const q = p + wordcuts[p]\n    if (wordcuts[q] && q < query.length - 1)\n      stack[s++] = q\n\n    /* Add current segment */\n    segments.add(query.slice(p, q))\n  }\n\n  // @todo fix this case in the code block above, this is a hotfix\n  if (segments.has(\"\"))\n    return new Set([query])\n\n  /* Return segmented query value */\n  return segments\n}\n", "/*\n * Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport {\n  SearchDocument,\n  SearchIndex,\n  SearchOptions,\n  setupSearchDocumentMap\n} from \"../config\"\nimport {\n  Position,\n  PositionTable,\n  highlight,\n  highlightAll,\n  tokenize\n} from \"../internal\"\nimport {\n  SearchQueryTerms,\n  getSearchQueryTerms,\n  parseSearchQuery,\n  segment,\n  transformSearchQuery\n} from \"../query\"\n\n/* ----------------------------------------------------------------------------\n * Types\n * ------------------------------------------------------------------------- */\n\n/**\n * Search item\n */\nexport interface SearchItem\n  extends SearchDocument\n{\n  score: number                        /* Score (relevance) */\n  terms: SearchQueryTerms              /* Search query terms */\n}\n\n/**\n * Search result\n */\nexport interface SearchResult {\n  items: SearchItem[][]                /* Search items */\n  suggest?: string[]                   /* Search suggestions */\n}\n\n/* ----------------------------------------------------------------------------\n * Functions\n * ------------------------------------------------------------------------- */\n\n/**\n * Create field extractor factory\n *\n * @param table - Position table map\n *\n * @returns Extractor factory\n */\nfunction extractor(table: Map<string, PositionTable>) {\n  return (name: keyof SearchDocument) => {\n    return (doc: SearchDocument) => {\n      if (typeof doc[name] === \"undefined\")\n        return undefined\n\n      /* Compute identifier and initialize table */\n      const id = [doc.location, name].join(\":\")\n      table.set(id, lunr.tokenizer.table = [])\n\n      /* Return field value */\n      return doc[name]\n    }\n  }\n}\n\n/**\n * Compute the difference of two 
lists of strings\n *\n * @param a - 1st list of strings\n * @param b - 2nd list of strings\n *\n * @returns Difference\n */\nfunction difference(a: string[], b: string[]): string[] {\n  const [x, y] = [new Set(a), new Set(b)]\n  return [\n    ...new Set([...x].filter(value => !y.has(value)))\n  ]\n}\n\n/* ----------------------------------------------------------------------------\n * Class\n * ------------------------------------------------------------------------- */\n\n/**\n * Search index\n */\nexport class Search {\n\n  /**\n   * Search document map\n   */\n  protected map: Map<string, SearchDocument>\n\n  /**\n   * Search options\n   */\n  protected options: SearchOptions\n\n  /**\n   * The underlying Lunr.js search index\n   */\n  protected index: lunr.Index\n\n  /**\n   * Internal position table map\n   */\n  protected table: Map<string, PositionTable>\n\n  /**\n   * Create the search integration\n   *\n   * @param data - Search index\n   */\n  public constructor({ config, docs, options }: SearchIndex) {\n    const field = extractor(this.table = new Map())\n\n    /* Set up document map and options */\n    this.map = setupSearchDocumentMap(docs)\n    this.options = options\n\n    /* Set up document index */\n    this.index = lunr(function () {\n      this.metadataWhitelist = [\"position\"]\n      this.b(0)\n\n      /* Set up (multi-)language support */\n      if (config.lang.length === 1 && config.lang[0] !== \"en\") {\n        // @ts-expect-error - namespace indexing not supported\n        this.use(lunr[config.lang[0]])\n      } else if (config.lang.length > 1) {\n        this.use(lunr.multiLanguage(...config.lang))\n      }\n\n      /* Set up custom tokenizer (must be after language setup) */\n      this.tokenizer = tokenize as typeof lunr.tokenizer\n      lunr.tokenizer.separator = new RegExp(config.separator)\n\n      /* Set up custom segmenter, if loaded */\n      lunr.segmenter = \"TinySegmenter\" in lunr\n        ? 
new lunr.TinySegmenter()\n        : undefined\n\n      /* Compute functions to be removed from the pipeline */\n      const fns = difference([\n        \"trimmer\", \"stopWordFilter\", \"stemmer\"\n      ], config.pipeline)\n\n      /* Remove functions from the pipeline for registered languages */\n      for (const lang of config.lang.map(language => (\n        // @ts-expect-error - namespace indexing not supported\n        language === \"en\" ? lunr : lunr[language]\n      )))\n        for (const fn of fns) {\n          this.pipeline.remove(lang[fn])\n          this.searchPipeline.remove(lang[fn])\n        }\n\n      /* Set up index reference */\n      this.ref(\"location\")\n\n      /* Set up index fields */\n      this.field(\"title\", { boost: 1e3, extractor: field(\"title\") })\n      this.field(\"text\",  { boost: 1e0, extractor: field(\"text\") })\n      this.field(\"tags\",  { boost: 1e6, extractor: field(\"tags\") })\n\n      /* Add documents to index */\n      for (const doc of docs)\n        this.add(doc, { boost: doc.boost })\n    })\n  }\n\n  /**\n   * Search for matching documents\n   *\n   * @param query - Search query\n   *\n   * @returns Search result\n   */\n  public search(query: string): SearchResult {\n\n    // Experimental Chinese segmentation\n    query = query.replace(/\\p{sc=Han}+/gu, value => {\n      return [...segment(value, this.index.invertedIndex)]\n        .join(\"* \")\n    })\n\n    // @todo: move segmenter (above) into transformSearchQuery\n    query = transformSearchQuery(query)\n    if (!query)\n      return { items: [] }\n\n    /* Parse query to extract clauses for analysis */\n    const clauses = parseSearchQuery(query)\n      .filter(clause => (\n        clause.presence !== lunr.Query.presence.PROHIBITED\n      ))\n\n    /* Perform search and post-process results */\n    const groups = this.index.search(query)\n\n      /* Apply post-query boosts based on title and search query terms */\n      .reduce<SearchItem[]>((item, { 
ref, score, matchData }) => {\n        let doc = this.map.get(ref)\n        if (typeof doc !== \"undefined\") {\n\n          /* Shallow copy document */\n          doc = { ...doc }\n          if (doc.tags)\n            doc.tags = [...doc.tags]\n\n          /* Compute and analyze search query terms */\n          const terms = getSearchQueryTerms(\n            clauses,\n            Object.keys(matchData.metadata)\n          )\n\n          /* Highlight matches in fields */\n          for (const field of this.index.fields) {\n            if (typeof doc[field] === \"undefined\")\n              continue\n\n            /* Collect positions from matches */\n            const positions: Position[] = []\n            for (const match of Object.values(matchData.metadata))\n              if (typeof match[field] !== \"undefined\")\n                positions.push(...match[field].position)\n\n            /* Skip highlighting, if no positions were collected */\n            if (!positions.length)\n              continue\n\n            /* Load table and determine highlighting method */\n            const table = this.table.get([doc.location, field].join(\":\"))!\n            const fn = Array.isArray(doc[field])\n              ? 
highlightAll\n              : highlight\n\n            // @ts-expect-error - stop moaning, TypeScript!\n            doc[field] = fn(doc[field], table, positions, field !== \"text\")\n          }\n\n          /* Highlight title and text and apply post-query boosts */\n          const boost = +!doc.parent +\n            Object.values(terms)\n              .filter(t => t).length /\n            Object.keys(terms).length\n\n          /* Append item */\n          item.push({\n            ...doc,\n            score: score * (1 + boost ** 2),\n            terms\n          })\n        }\n        return item\n      }, [])\n\n      /* Sort search results again after applying boosts */\n      .sort((a, b) => b.score - a.score)\n\n      /* Group search results by article */\n      .reduce((items, result) => {\n        const doc = this.map.get(result.location)\n        if (typeof doc !== \"undefined\") {\n          const ref = doc.parent\n            ? doc.parent.location\n            : doc.location\n          items.set(ref, [...items.get(ref) || [], result])\n        }\n        return items\n      }, new Map<string, SearchItem[]>())\n\n    /* Ensure that every item set has an article */\n    for (const [ref, items] of groups)\n      if (!items.find(item => item.location === ref)) {\n        const doc = this.map.get(ref)!\n        items.push({ ...doc, score: 0, terms: {} })\n      }\n\n    /* Generate search suggestions, if desired */\n    let suggest: string[] | undefined\n    if (this.options.suggest) {\n      const titles = this.index.query(builder => {\n        for (const clause of clauses)\n          builder.term(clause.term, {\n            fields: [\"title\"],\n            presence: lunr.Query.presence.REQUIRED,\n            wildcard: lunr.Query.wildcard.TRAILING\n          })\n      })\n\n      /* Retrieve suggestions for best match */\n      suggest = titles.length\n        ? 
Object.keys(titles[0].matchData.metadata)\n        : []\n    }\n\n    /* Return search result */\n    return {\n      items: [...groups.values()],\n      ...typeof suggest !== \"undefined\" && { suggest }\n    }\n  }\n}\n"],
+  "mappings": "6lCAAA,IAAAA,GAAAC,GAAA,CAAAC,EAAAC,KAAA;AAAA;AAAA;AAAA;AAAA,IAME,UAAU,CAiCZ,IAAIC,EAAO,SAAUC,EAAQ,CAC3B,IAAIC,EAAU,IAAIF,EAAK,QAEvB,OAAAE,EAAQ,SAAS,IACfF,EAAK,QACLA,EAAK,eACLA,EAAK,OACP,EAEAE,EAAQ,eAAe,IACrBF,EAAK,OACP,EAEAC,EAAO,KAAKC,EAASA,CAAO,EACrBA,EAAQ,MAAM,CACvB,EAEAF,EAAK,QAAU,QACf;AAAA;AAAA;AAAA,GASAA,EAAK,MAAQ,CAAC,EASdA,EAAK,MAAM,KAAQ,SAAUG,EAAQ,CAEnC,OAAO,SAAUC,EAAS,CACpBD,EAAO,SAAW,QAAQ,MAC5B,QAAQ,KAAKC,CAAO,CAExB,CAEF,EAAG,IAAI,EAaPJ,EAAK,MAAM,SAAW,SAAUK,EAAK,CACnC,OAAsBA,GAAQ,KACrB,GAEAA,EAAI,SAAS,CAExB,EAkBAL,EAAK,MAAM,MAAQ,SAAUK,EAAK,CAChC,GAAIA,GAAQ,KACV,OAAOA,EAMT,QAHIC,EAAQ,OAAO,OAAO,IAAI,EAC1BC,EAAO,OAAO,KAAKF,CAAG,EAEjB,EAAI,EAAG,EAAIE,EAAK,OAAQ,IAAK,CACpC,IAAIC,EAAMD,EAAK,CAAC,EACZE,EAAMJ,EAAIG,CAAG,EAEjB,GAAI,MAAM,QAAQC,CAAG,EAAG,CACtBH,EAAME,CAAG,EAAIC,EAAI,MAAM,EACvB,QACF,CAEA,GAAI,OAAOA,GAAQ,UACf,OAAOA,GAAQ,UACf,OAAOA,GAAQ,UAAW,CAC5BH,EAAME,CAAG,EAAIC,EACb,QACF,CAEA,MAAM,IAAI,UAAU,uDAAuD,CAC7E,CAEA,OAAOH,CACT,EACAN,EAAK,SAAW,SAAUU,EAAQC,EAAWC,EAAa,CACxD,KAAK,OAASF,EACd,KAAK,UAAYC,EACjB,KAAK,aAAeC,CACtB,EAEAZ,EAAK,SAAS,OAAS,IAEvBA,EAAK,SAAS,WAAa,SAAUa,EAAG,CACtC,IAAIC,EAAID,EAAE,QAAQb,EAAK,SAAS,MAAM,EAEtC,GAAIc,IAAM,GACR,KAAM,6BAGR,IAAIC,EAAWF,EAAE,MAAM,EAAGC,CAAC,EACvBJ,EAASG,EAAE,MAAMC,EAAI,CAAC,EAE1B,OAAO,IAAId,EAAK,SAAUU,EAAQK,EAAUF,CAAC,CAC/C,EAEAb,EAAK,SAAS,UAAU,SAAW,UAAY,CAC7C,OAAI,KAAK,cAAgB,OACvB,KAAK,aAAe,KAAK,UAAYA,EAAK,SAAS,OAAS,KAAK,QAG5D,KAAK,YACd,EACA;AAAA;AAAA;AAAA,GAUAA,EAAK,IAAM,SAAUgB,EAAU,CAG7B,GAFA,KAAK,SAAW,OAAO,OAAO,IAAI,EAE9BA,EAAU,CACZ,KAAK,OAASA,EAAS,OAEvB,QAASC,EAAI,EAAGA,EAAI,KAAK,OAAQA,IAC/B,KAAK,SAASD,EAASC,CAAC,CAAC,EAAI,EAEjC,MACE,KAAK,OAAS,CAElB,EASAjB,EAAK,IAAI,SAAW,CAClB,UAAW,SAAUkB,EAAO,CAC1B,OAAOA,CACT,EAEA,MAAO,UAAY,CACjB,OAAO,IACT,EAEA,SAAU,UAAY,CACpB,MAAO,EACT,CACF,EASAlB,EAAK,IAAI,MAAQ,CACf,UAAW,UAAY,CACrB,OAAO,IACT,EAEA,MAAO,SAAUkB,EAAO,CACtB,OAAOA,CACT,EAEA,SAAU,UAAY,CACpB,MAAO,EACT,CACF,EAQAlB,EAAK,IAAI,UAAU,SAAW,SAAUmB,EAAQ,CAC9C,MAAO,CAAC,CAAC,KAAK,SAASA,CAAM,CAC/B,EAUAnB,EAAK,I
AAI,UAAU,UAAY,SAAUkB,EAAO,CAC9C,IAAIE,EAAGC,EAAGL,EAAUM,EAAe,CAAC,EAEpC,GAAIJ,IAAUlB,EAAK,IAAI,SACrB,OAAO,KAGT,GAAIkB,IAAUlB,EAAK,IAAI,MACrB,OAAOkB,EAGL,KAAK,OAASA,EAAM,QACtBE,EAAI,KACJC,EAAIH,IAEJE,EAAIF,EACJG,EAAI,MAGNL,EAAW,OAAO,KAAKI,EAAE,QAAQ,EAEjC,QAASH,EAAI,EAAGA,EAAID,EAAS,OAAQC,IAAK,CACxC,IAAIM,EAAUP,EAASC,CAAC,EACpBM,KAAWF,EAAE,UACfC,EAAa,KAAKC,CAAO,CAE7B,CAEA,OAAO,IAAIvB,EAAK,IAAKsB,CAAY,CACnC,EASAtB,EAAK,IAAI,UAAU,MAAQ,SAAUkB,EAAO,CAC1C,OAAIA,IAAUlB,EAAK,IAAI,SACdA,EAAK,IAAI,SAGdkB,IAAUlB,EAAK,IAAI,MACd,KAGF,IAAIA,EAAK,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,OAAO,OAAO,KAAKkB,EAAM,QAAQ,CAAC,CAAC,CACpF,EASAlB,EAAK,IAAM,SAAUwB,EAASC,EAAe,CAC3C,IAAIC,EAAoB,EAExB,QAASf,KAAaa,EAChBb,GAAa,WACjBe,GAAqB,OAAO,KAAKF,EAAQb,CAAS,CAAC,EAAE,QAGvD,IAAIgB,GAAKF,EAAgBC,EAAoB,KAAQA,EAAoB,IAEzE,OAAO,KAAK,IAAI,EAAI,KAAK,IAAIC,CAAC,CAAC,CACjC,EAUA3B,EAAK,MAAQ,SAAU4B,EAAKC,EAAU,CACpC,KAAK,IAAMD,GAAO,GAClB,KAAK,SAAWC,GAAY,CAAC,CAC/B,EAOA7B,EAAK,MAAM,UAAU,SAAW,UAAY,CAC1C,OAAO,KAAK,GACd,EAsBAA,EAAK,MAAM,UAAU,OAAS,SAAU8B,EAAI,CAC1C,YAAK,IAAMA,EAAG,KAAK,IAAK,KAAK,QAAQ,EAC9B,IACT,EASA9B,EAAK,MAAM,UAAU,MAAQ,SAAU8B,EAAI,CACzC,OAAAA,EAAKA,GAAM,SAAUjB,EAAG,CAAE,OAAOA,CAAE,EAC5B,IAAIb,EAAK,MAAO8B,EAAG,KAAK,IAAK,KAAK,QAAQ,EAAG,KAAK,QAAQ,CACnE,EACA;AAAA;AAAA;AAAA,GAuBA9B,EAAK,UAAY,SAAUK,EAAKwB,EAAU,CACxC,GAAIxB,GAAO,MAAQA,GAAO,KACxB,MAAO,CAAC,EAGV,GAAI,MAAM,QAAQA,CAAG,EACnB,OAAOA,EAAI,IAAI,SAAU0B,EAAG,CAC1B,OAAO,IAAI/B,EAAK,MACdA,EAAK,MAAM,SAAS+B,CAAC,EAAE,YAAY,EACnC/B,EAAK,MAAM,MAAM6B,CAAQ,CAC3B,CACF,CAAC,EAOH,QAJID,EAAMvB,EAAI,SAAS,EAAE,YAAY,EACjC2B,EAAMJ,EAAI,OACVK,EAAS,CAAC,EAELC,EAAW,EAAGC,EAAa,EAAGD,GAAYF,EAAKE,IAAY,CAClE,IAAIE,EAAOR,EAAI,OAAOM,CAAQ,EAC1BG,EAAcH,EAAWC,EAE7B,GAAKC,EAAK,MAAMpC,EAAK,UAAU,SAAS,GAAKkC,GAAYF,EAAM,CAE7D,GAAIK,EAAc,EAAG,CACnB,IAAIC,EAAgBtC,EAAK,MAAM,MAAM6B,CAAQ,GAAK,CAAC,EACnDS,EAAc,SAAc,CAACH,EAAYE,CAAW,EACpDC,EAAc,MAAWL,EAAO,OAEhCA,EAAO,KACL,IAAIjC,EAAK,MACP4B,EAAI,MAAMO,EAAYD,CAAQ,EAC9BI,CACF,CACF,CACF,CAEAH,EAAaD,EAAW,CAC1B,CAEF,CAEA,OAAOD,CACT,EASAjC,EAAK,UAAU,UA
AY,UAC3B;AAAA;AAAA;AAAA,GAkCAA,EAAK,SAAW,UAAY,CAC1B,KAAK,OAAS,CAAC,CACjB,EAEAA,EAAK,SAAS,oBAAsB,OAAO,OAAO,IAAI,EAmCtDA,EAAK,SAAS,iBAAmB,SAAU8B,EAAIS,EAAO,CAChDA,KAAS,KAAK,qBAChBvC,EAAK,MAAM,KAAK,6CAA+CuC,CAAK,EAGtET,EAAG,MAAQS,EACXvC,EAAK,SAAS,oBAAoB8B,EAAG,KAAK,EAAIA,CAChD,EAQA9B,EAAK,SAAS,4BAA8B,SAAU8B,EAAI,CACxD,IAAIU,EAAeV,EAAG,OAAUA,EAAG,SAAS,KAAK,oBAE5CU,GACHxC,EAAK,MAAM,KAAK;AAAA,EAAmG8B,CAAE,CAEzH,EAYA9B,EAAK,SAAS,KAAO,SAAUyC,EAAY,CACzC,IAAIC,EAAW,IAAI1C,EAAK,SAExB,OAAAyC,EAAW,QAAQ,SAAUE,EAAQ,CACnC,IAAIb,EAAK9B,EAAK,SAAS,oBAAoB2C,CAAM,EAEjD,GAAIb,EACFY,EAAS,IAAIZ,CAAE,MAEf,OAAM,IAAI,MAAM,sCAAwCa,CAAM,CAElE,CAAC,EAEMD,CACT,EASA1C,EAAK,SAAS,UAAU,IAAM,UAAY,CACxC,IAAI4C,EAAM,MAAM,UAAU,MAAM,KAAK,SAAS,EAE9CA,EAAI,QAAQ,SAAUd,EAAI,CACxB9B,EAAK,SAAS,4BAA4B8B,CAAE,EAC5C,KAAK,OAAO,KAAKA,CAAE,CACrB,EAAG,IAAI,CACT,EAWA9B,EAAK,SAAS,UAAU,MAAQ,SAAU6C,EAAYC,EAAO,CAC3D9C,EAAK,SAAS,4BAA4B8C,CAAK,EAE/C,IAAIC,EAAM,KAAK,OAAO,QAAQF,CAAU,EACxC,GAAIE,GAAO,GACT,MAAM,IAAI,MAAM,wBAAwB,EAG1CA,EAAMA,EAAM,EACZ,KAAK,OAAO,OAAOA,EAAK,EAAGD,CAAK,CAClC,EAWA9C,EAAK,SAAS,UAAU,OAAS,SAAU6C,EAAYC,EAAO,CAC5D9C,EAAK,SAAS,4BAA4B8C,CAAK,EAE/C,IAAIC,EAAM,KAAK,OAAO,QAAQF,CAAU,EACxC,GAAIE,GAAO,GACT,MAAM,IAAI,MAAM,wBAAwB,EAG1C,KAAK,OAAO,OAAOA,EAAK,EAAGD,CAAK,CAClC,EAOA9C,EAAK,SAAS,UAAU,OAAS,SAAU8B,EAAI,CAC7C,IAAIiB,EAAM,KAAK,OAAO,QAAQjB,CAAE,EAC5BiB,GAAO,IAIX,KAAK,OAAO,OAAOA,EAAK,CAAC,CAC3B,EASA/C,EAAK,SAAS,UAAU,IAAM,SAAUiC,EAAQ,CAG9C,QAFIe,EAAc,KAAK,OAAO,OAErB/B,EAAI,EAAGA,EAAI+B,EAAa/B,IAAK,CAIpC,QAHIa,EAAK,KAAK,OAAOb,CAAC,EAClBgC,EAAO,CAAC,EAEHC,EAAI,EAAGA,EAAIjB,EAAO,OAAQiB,IAAK,CACtC,IAAIC,EAASrB,EAAGG,EAAOiB,CAAC,EAAGA,EAAGjB,CAAM,EAEpC,GAAI,EAAAkB,GAAW,MAA6BA,IAAW,IAEvD,GAAI,MAAM,QAAQA,CAAM,EACtB,QAASC,EAAI,EAAGA,EAAID,EAAO,OAAQC,IACjCH,EAAK,KAAKE,EAAOC,CAAC,CAAC,OAGrBH,EAAK,KAAKE,CAAM,CAEpB,CAEAlB,EAASgB,CACX,CAEA,OAAOhB,CACT,EAYAjC,EAAK,SAAS,UAAU,UAAY,SAAU4B,EAAKC,EAAU,CAC3D,IAAIwB,EAAQ,IAAIrD,EAAK,MAAO4B,EAAKC,CAAQ,EAEzC,OAAO,KAAK,IAAI,CAACwB,CAAK,CAAC,EAAE,IAAI,SAAUtB,EAAG,CACxC,OAAOA,EAAE,SAAS,
CACpB,CAAC,CACH,EAMA/B,EAAK,SAAS,UAAU,MAAQ,UAAY,CAC1C,KAAK,OAAS,CAAC,CACjB,EASAA,EAAK,SAAS,UAAU,OAAS,UAAY,CAC3C,OAAO,KAAK,OAAO,IAAI,SAAU8B,EAAI,CACnC,OAAA9B,EAAK,SAAS,4BAA4B8B,CAAE,EAErCA,EAAG,KACZ,CAAC,CACH,EACA;AAAA;AAAA;AAAA,GAqBA9B,EAAK,OAAS,SAAUgB,EAAU,CAChC,KAAK,WAAa,EAClB,KAAK,SAAWA,GAAY,CAAC,CAC/B,EAaAhB,EAAK,OAAO,UAAU,iBAAmB,SAAUsD,EAAO,CAExD,GAAI,KAAK,SAAS,QAAU,EAC1B,MAAO,GAST,QANIC,EAAQ,EACRC,EAAM,KAAK,SAAS,OAAS,EAC7BnB,EAAcmB,EAAMD,EACpBE,EAAa,KAAK,MAAMpB,EAAc,CAAC,EACvCqB,EAAa,KAAK,SAASD,EAAa,CAAC,EAEtCpB,EAAc,IACfqB,EAAaJ,IACfC,EAAQE,GAGNC,EAAaJ,IACfE,EAAMC,GAGJC,GAAcJ,IAIlBjB,EAAcmB,EAAMD,EACpBE,EAAaF,EAAQ,KAAK,MAAMlB,EAAc,CAAC,EAC/CqB,EAAa,KAAK,SAASD,EAAa,CAAC,EAO3C,GAJIC,GAAcJ,GAIdI,EAAaJ,EACf,OAAOG,EAAa,EAGtB,GAAIC,EAAaJ,EACf,OAAQG,EAAa,GAAK,CAE9B,EAWAzD,EAAK,OAAO,UAAU,OAAS,SAAU2D,EAAWlD,EAAK,CACvD,KAAK,OAAOkD,EAAWlD,EAAK,UAAY,CACtC,KAAM,iBACR,CAAC,CACH,EAUAT,EAAK,OAAO,UAAU,OAAS,SAAU2D,EAAWlD,EAAKqB,EAAI,CAC3D,KAAK,WAAa,EAClB,IAAI8B,EAAW,KAAK,iBAAiBD,CAAS,EAE1C,KAAK,SAASC,CAAQ,GAAKD,EAC7B,KAAK,SAASC,EAAW,CAAC,EAAI9B,EAAG,KAAK,SAAS8B,EAAW,CAAC,EAAGnD,CAAG,EAEjE,KAAK,SAAS,OAAOmD,EAAU,EAAGD,EAAWlD,CAAG,CAEpD,EAOAT,EAAK,OAAO,UAAU,UAAY,UAAY,CAC5C,GAAI,KAAK,WAAY,OAAO,KAAK,WAKjC,QAHI6D,EAAe,EACfC,EAAiB,KAAK,SAAS,OAE1B7C,EAAI,EAAGA,EAAI6C,EAAgB7C,GAAK,EAAG,CAC1C,IAAIR,EAAM,KAAK,SAASQ,CAAC,EACzB4C,GAAgBpD,EAAMA,CACxB,CAEA,OAAO,KAAK,WAAa,KAAK,KAAKoD,CAAY,CACjD,EAQA7D,EAAK,OAAO,UAAU,IAAM,SAAU+D,EAAa,CAOjD,QANIC,EAAa,EACb5C,EAAI,KAAK,SAAUC,EAAI0C,EAAY,SACnCE,EAAO7C,EAAE,OAAQ8C,EAAO7C,EAAE,OAC1B8C,EAAO,EAAGC,EAAO,EACjBnD,EAAI,EAAGiC,EAAI,EAERjC,EAAIgD,GAAQf,EAAIgB,GACrBC,EAAO/C,EAAEH,CAAC,EAAGmD,EAAO/C,EAAE6B,CAAC,EACnBiB,EAAOC,EACTnD,GAAK,EACIkD,EAAOC,EAChBlB,GAAK,EACIiB,GAAQC,IACjBJ,GAAc5C,EAAEH,EAAI,CAAC,EAAII,EAAE6B,EAAI,CAAC,EAChCjC,GAAK,EACLiC,GAAK,GAIT,OAAOc,CACT,EASAhE,EAAK,OAAO,UAAU,WAAa,SAAU+D,EAAa,CACxD,OAAO,KAAK,IAAIA,CAAW,EAAI,KAAK,UAAU,GAAK,CACrD,EAOA/D,EAAK,OAAO,UAAU,QAAU,UAAY,CAG1C,QAFIqE,EAAS,IAAI,MAAO,KAAK,SAAS,OAAS,CAAC,EAEvCpD,EAAI,EAAGiC,EA
AI,EAAGjC,EAAI,KAAK,SAAS,OAAQA,GAAK,EAAGiC,IACvDmB,EAAOnB,CAAC,EAAI,KAAK,SAASjC,CAAC,EAG7B,OAAOoD,CACT,EAOArE,EAAK,OAAO,UAAU,OAAS,UAAY,CACzC,OAAO,KAAK,QACd,EAEA;AAAA;AAAA;AAAA;AAAA,GAiBAA,EAAK,QAAW,UAAU,CACxB,IAAIsE,EAAY,CACZ,QAAY,MACZ,OAAW,OACX,KAAS,OACT,KAAS,OACT,KAAS,MACT,IAAQ,MACR,KAAS,KACT,MAAU,MACV,IAAQ,IACR,MAAU,MACV,QAAY,MACZ,MAAU,MACV,KAAS,MACT,MAAU,KACV,QAAY,MACZ,QAAY,MACZ,QAAY,MACZ,MAAU,KACV,MAAU,MACV,OAAW,MACX,KAAS,KACX,EAEAC,EAAY,CACV,MAAU,KACV,MAAU,GACV,MAAU,KACV,MAAU,KACV,KAAS,KACT,IAAQ,GACR,KAAS,EACX,EAEAC,EAAI,WACJC,EAAI,WACJC,EAAIF,EAAI,aACRG,EAAIF,EAAI,WAERG,EAAO,KAAOF,EAAI,KAAOC,EAAID,EAC7BG,EAAO,KAAOH,EAAI,KAAOC,EAAID,EAAI,IAAMC,EAAI,MAC3CG,EAAO,KAAOJ,EAAI,KAAOC,EAAID,EAAIC,EAAID,EACrCK,EAAM,KAAOL,EAAI,KAAOD,EAEtBO,EAAU,IAAI,OAAOJ,CAAI,EACzBK,EAAU,IAAI,OAAOH,CAAI,EACzBI,EAAU,IAAI,OAAOL,CAAI,EACzBM,EAAS,IAAI,OAAOJ,CAAG,EAEvBK,EAAQ,kBACRC,EAAS,iBACTC,EAAQ,aACRC,EAAS,kBACTC,EAAU,KACVC,EAAW,cACXC,EAAW,IAAI,OAAO,oBAAoB,EAC1CC,EAAW,IAAI,OAAO,IAAMjB,EAAID,EAAI,cAAc,EAElDmB,EAAQ,mBACRC,EAAO,2IAEPC,EAAO,iDAEPC,EAAO,sFACPC,EAAQ,oBAERC,EAAO,WACPC,EAAS,MACTC,EAAQ,IAAI,OAAO,IAAMzB,EAAID,EAAI,cAAc,EAE/C2B,EAAgB,SAAuBC,EAAG,CAC5C,IAAIC,EACFC,EACAC,EACAC,EACAC,EACAC,EACAC,EAEF,GAAIP,EAAE,OAAS,EAAK,OAAOA,EAiB3B,GAfAG,EAAUH,EAAE,OAAO,EAAE,CAAC,EAClBG,GAAW,MACbH,EAAIG,EAAQ,YAAY,EAAIH,EAAE,OAAO,CAAC,GAIxCI,EAAKrB,EACLsB,EAAMrB,EAEFoB,EAAG,KAAKJ,CAAC,EAAKA,EAAIA,EAAE,QAAQI,EAAG,MAAM,EAChCC,EAAI,KAAKL,CAAC,IAAKA,EAAIA,EAAE,QAAQK,EAAI,MAAM,GAGhDD,EAAKnB,EACLoB,EAAMnB,EACFkB,EAAG,KAAKJ,CAAC,EAAG,CACd,IAAIQ,EAAKJ,EAAG,KAAKJ,CAAC,EAClBI,EAAKzB,EACDyB,EAAG,KAAKI,EAAG,CAAC,CAAC,IACfJ,EAAKjB,EACLa,EAAIA,EAAE,QAAQI,EAAG,EAAE,EAEvB,SAAWC,EAAI,KAAKL,CAAC,EAAG,CACtB,IAAIQ,EAAKH,EAAI,KAAKL,CAAC,EACnBC,EAAOO,EAAG,CAAC,EACXH,EAAMvB,EACFuB,EAAI,KAAKJ,CAAI,IACfD,EAAIC,EACJI,EAAMjB,EACNkB,EAAMjB,EACNkB,EAAMjB,EACFe,EAAI,KAAKL,CAAC,EAAKA,EAAIA,EAAI,IAClBM,EAAI,KAAKN,CAAC,GAAKI,EAAKjB,EAASa,EAAIA,EAAE,QAAQI,EAAG,EAAE,GAChDG,EAAI,KAAKP,CAAC,IAAKA,EAAIA,EAAI,KAEpC,CAIA,GADAI,EAAKb,EACDa,
EAAG,KAAKJ,CAAC,EAAG,CACd,IAAIQ,EAAKJ,EAAG,KAAKJ,CAAC,EAClBC,EAAOO,EAAG,CAAC,EACXR,EAAIC,EAAO,GACb,CAIA,GADAG,EAAKZ,EACDY,EAAG,KAAKJ,CAAC,EAAG,CACd,IAAIQ,EAAKJ,EAAG,KAAKJ,CAAC,EAClBC,EAAOO,EAAG,CAAC,EACXN,EAASM,EAAG,CAAC,EACbJ,EAAKzB,EACDyB,EAAG,KAAKH,CAAI,IACdD,EAAIC,EAAOhC,EAAUiC,CAAM,EAE/B,CAIA,GADAE,EAAKX,EACDW,EAAG,KAAKJ,CAAC,EAAG,CACd,IAAIQ,EAAKJ,EAAG,KAAKJ,CAAC,EAClBC,EAAOO,EAAG,CAAC,EACXN,EAASM,EAAG,CAAC,EACbJ,EAAKzB,EACDyB,EAAG,KAAKH,CAAI,IACdD,EAAIC,EAAO/B,EAAUgC,CAAM,EAE/B,CAKA,GAFAE,EAAKV,EACLW,EAAMV,EACFS,EAAG,KAAKJ,CAAC,EAAG,CACd,IAAIQ,EAAKJ,EAAG,KAAKJ,CAAC,EAClBC,EAAOO,EAAG,CAAC,EACXJ,EAAKxB,EACDwB,EAAG,KAAKH,CAAI,IACdD,EAAIC,EAER,SAAWI,EAAI,KAAKL,CAAC,EAAG,CACtB,IAAIQ,EAAKH,EAAI,KAAKL,CAAC,EACnBC,EAAOO,EAAG,CAAC,EAAIA,EAAG,CAAC,EACnBH,EAAMzB,EACFyB,EAAI,KAAKJ,CAAI,IACfD,EAAIC,EAER,CAIA,GADAG,EAAKR,EACDQ,EAAG,KAAKJ,CAAC,EAAG,CACd,IAAIQ,EAAKJ,EAAG,KAAKJ,CAAC,EAClBC,EAAOO,EAAG,CAAC,EACXJ,EAAKxB,EACLyB,EAAMxB,EACNyB,EAAMR,GACFM,EAAG,KAAKH,CAAI,GAAMI,EAAI,KAAKJ,CAAI,GAAK,CAAEK,EAAI,KAAKL,CAAI,KACrDD,EAAIC,EAER,CAEA,OAAAG,EAAKP,EACLQ,EAAMzB,EACFwB,EAAG,KAAKJ,CAAC,GAAKK,EAAI,KAAKL,CAAC,IAC1BI,EAAKjB,EACLa,EAAIA,EAAE,QAAQI,EAAG,EAAE,GAKjBD,GAAW,MACbH,EAAIG,EAAQ,YAAY,EAAIH,EAAE,OAAO,CAAC,GAGjCA,CACT,EAEA,OAAO,SAAUhD,EAAO,CACtB,OAAOA,EAAM,OAAO+C,CAAa,CACnC,CACF,EAAG,EAEHpG,EAAK,SAAS,iBAAiBA,EAAK,QAAS,SAAS,EACtD;AAAA;AAAA;AAAA,GAkBAA,EAAK,uBAAyB,SAAU8G,EAAW,CACjD,IAAIC,EAAQD,EAAU,OAAO,SAAU7D,EAAM+D,EAAU,CACrD,OAAA/D,EAAK+D,CAAQ,EAAIA,EACV/D,CACT,EAAG,CAAC,CAAC,EAEL,OAAO,SAAUI,EAAO,CACtB,GAAIA,GAAS0D,EAAM1D,EAAM,SAAS,CAAC,IAAMA,EAAM,SAAS,EAAG,OAAOA,CACpE,CACF,EAeArD,EAAK,eAAiBA,EAAK,uBAAuB,CAChD,IACA,OACA,QACA,SACA,QACA,MACA,SACA,OACA,KACA,QACA,KACA,MACA,MACA,MACA,KACA,KACA,KACA,UACA,OACA,MACA,KACA,MACA,SACA,QACA,OACA,MACA,KACA,OACA,SACA,OACA,OACA,QACA,MACA,OACA,MACA,MACA,MACA,MACA,OACA,KACA,MACA,OACA,MACA,MACA,MACA,UACA,IACA,KACA,KACA,OACA,KACA,KACA,MACA,OACA,QACA,MACA,OACA,SACA,MACA,KACA,QACA,OACA,OACA,KACA,UACA,KACA,MACA,MACA,KACA,MACA,QACA,KACA,OACA,KACA,QAC
A,MACA,MACA,SACA,OACA,MACA,OACA,MACA,SACA,QACA,KACA,OACA,OACA,OACA,MACA,QACA,OACA,OACA,QACA,QACA,OACA,OACA,MACA,KACA,MACA,OACA,KACA,QACA,MACA,KACA,OACA,OACA,OACA,QACA,QACA,QACA,MACA,OACA,MACA,OACA,OACA,QACA,MACA,MACA,MACF,CAAC,EAEDA,EAAK,SAAS,iBAAiBA,EAAK,eAAgB,gBAAgB,EACpE;AAAA;AAAA;AAAA,GAoBAA,EAAK,QAAU,SAAUqD,EAAO,CAC9B,OAAOA,EAAM,OAAO,SAAUxC,EAAG,CAC/B,OAAOA,EAAE,QAAQ,OAAQ,EAAE,EAAE,QAAQ,OAAQ,EAAE,CACjD,CAAC,CACH,EAEAb,EAAK,SAAS,iBAAiBA,EAAK,QAAS,SAAS,EACtD;AAAA;AAAA;AAAA,GA0BAA,EAAK,SAAW,UAAY,CAC1B,KAAK,MAAQ,GACb,KAAK,MAAQ,CAAC,EACd,KAAK,GAAKA,EAAK,SAAS,QACxBA,EAAK,SAAS,SAAW,CAC3B,EAUAA,EAAK,SAAS,QAAU,EASxBA,EAAK,SAAS,UAAY,SAAUiH,EAAK,CAGvC,QAFI/G,EAAU,IAAIF,EAAK,SAAS,QAEvBiB,EAAI,EAAGe,EAAMiF,EAAI,OAAQhG,EAAIe,EAAKf,IACzCf,EAAQ,OAAO+G,EAAIhG,CAAC,CAAC,EAGvB,OAAAf,EAAQ,OAAO,EACRA,EAAQ,IACjB,EAWAF,EAAK,SAAS,WAAa,SAAUkH,EAAQ,CAC3C,MAAI,iBAAkBA,EACblH,EAAK,SAAS,gBAAgBkH,EAAO,KAAMA,EAAO,YAAY,EAE9DlH,EAAK,SAAS,WAAWkH,EAAO,IAAI,CAE/C,EAiBAlH,EAAK,SAAS,gBAAkB,SAAU4B,EAAKuF,EAAc,CAS3D,QARIC,EAAO,IAAIpH,EAAK,SAEhBqH,EAAQ,CAAC,CACX,KAAMD,EACN,eAAgBD,EAChB,IAAKvF,CACP,CAAC,EAEMyF,EAAM,QAAQ,CACnB,IAAIC,EAAQD,EAAM,IAAI,EAGtB,GAAIC,EAAM,IAAI,OAAS,EAAG,CACxB,IAAIlF,EAAOkF,EAAM,IAAI,OAAO,CAAC,EACzBC,EAEAnF,KAAQkF,EAAM,KAAK,MACrBC,EAAaD,EAAM,KAAK,MAAMlF,CAAI,GAElCmF,EAAa,IAAIvH,EAAK,SACtBsH,EAAM,KAAK,MAAMlF,CAAI,EAAImF,GAGvBD,EAAM,IAAI,QAAU,IACtBC,EAAW,MAAQ,IAGrBF,EAAM,KAAK,CACT,KAAME,EACN,eAAgBD,EAAM,eACtB,IAAKA,EAAM,IAAI,MAAM,CAAC,CACxB,CAAC,CACH,CAEA,GAAIA,EAAM,gBAAkB,EAK5B,IAAI,MAAOA,EAAM,KAAK,MACpB,IAAIE,EAAgBF,EAAM,KAAK,MAAM,GAAG,MACnC,CACL,IAAIE,EAAgB,IAAIxH,EAAK,SAC7BsH,EAAM,KAAK,MAAM,GAAG,EAAIE,CAC1B,CAgCA,GA9BIF,EAAM,IAAI,QAAU,IACtBE,EAAc,MAAQ,IAGxBH,EAAM,KAAK,CACT,KAAMG,EACN,eAAgBF,EAAM,eAAiB,EACvC,IAAKA,EAAM,GACb,CAAC,EAKGA,EAAM,IAAI,OAAS,GACrBD,EAAM,KAAK,CACT,KAAMC,EAAM,KACZ,eAAgBA,EAAM,eAAiB,EACvC,IAAKA,EAAM,IAAI,MAAM,CAAC,CACxB,CAAC,EAKCA,EAAM,IAAI,QAAU,IACtBA,EAAM,KAAK,MAAQ,IAMjBA,EAAM,IAAI,QAAU,EAAG,CACzB,GAAI,MAAOA,EAAM,KAAK,MACpB,IAAIG,EAAmBH,EAAM,KAAK,MAAM,G
AAG,MACtC,CACL,IAAIG,EAAmB,IAAIzH,EAAK,SAChCsH,EAAM,KAAK,MAAM,GAAG,EAAIG,CAC1B,CAEIH,EAAM,IAAI,QAAU,IACtBG,EAAiB,MAAQ,IAG3BJ,EAAM,KAAK,CACT,KAAMI,EACN,eAAgBH,EAAM,eAAiB,EACvC,IAAKA,EAAM,IAAI,MAAM,CAAC,CACxB,CAAC,CACH,CAKA,GAAIA,EAAM,IAAI,OAAS,EAAG,CACxB,IAAII,EAAQJ,EAAM,IAAI,OAAO,CAAC,EAC1BK,EAAQL,EAAM,IAAI,OAAO,CAAC,EAC1BM,EAEAD,KAASL,EAAM,KAAK,MACtBM,EAAgBN,EAAM,KAAK,MAAMK,CAAK,GAEtCC,EAAgB,IAAI5H,EAAK,SACzBsH,EAAM,KAAK,MAAMK,CAAK,EAAIC,GAGxBN,EAAM,IAAI,QAAU,IACtBM,EAAc,MAAQ,IAGxBP,EAAM,KAAK,CACT,KAAMO,EACN,eAAgBN,EAAM,eAAiB,EACvC,IAAKI,EAAQJ,EAAM,IAAI,MAAM,CAAC,CAChC,CAAC,CACH,EACF,CAEA,OAAOF,CACT,EAYApH,EAAK,SAAS,WAAa,SAAU4B,EAAK,CAYxC,QAXIiG,EAAO,IAAI7H,EAAK,SAChBoH,EAAOS,EAUF,EAAI,EAAG7F,EAAMJ,EAAI,OAAQ,EAAII,EAAK,IAAK,CAC9C,IAAII,EAAOR,EAAI,CAAC,EACZkG,EAAS,GAAK9F,EAAM,EAExB,GAAII,GAAQ,IACVyF,EAAK,MAAMzF,CAAI,EAAIyF,EACnBA,EAAK,MAAQC,MAER,CACL,IAAIC,EAAO,IAAI/H,EAAK,SACpB+H,EAAK,MAAQD,EAEbD,EAAK,MAAMzF,CAAI,EAAI2F,EACnBF,EAAOE,CACT,CACF,CAEA,OAAOX,CACT,EAYApH,EAAK,SAAS,UAAU,QAAU,UAAY,CAQ5C,QAPI+G,EAAQ,CAAC,EAETM,EAAQ,CAAC,CACX,OAAQ,GACR,KAAM,IACR,CAAC,EAEMA,EAAM,QAAQ,CACnB,IAAIC,EAAQD,EAAM,IAAI,EAClBW,EAAQ,OAAO,KAAKV,EAAM,KAAK,KAAK,EACpCtF,EAAMgG,EAAM,OAEZV,EAAM,KAAK,QAKbA,EAAM,OAAO,OAAO,CAAC,EACrBP,EAAM,KAAKO,EAAM,MAAM,GAGzB,QAASrG,EAAI,EAAGA,EAAIe,EAAKf,IAAK,CAC5B,IAAIgH,EAAOD,EAAM/G,CAAC,EAElBoG,EAAM,KAAK,CACT,OAAQC,EAAM,OAAO,OAAOW,CAAI,EAChC,KAAMX,EAAM,KAAK,MAAMW,CAAI,CAC7B,CAAC,CACH,CACF,CAEA,OAAOlB,CACT,EAYA/G,EAAK,SAAS,UAAU,SAAW,UAAY,CAS7C,GAAI,KAAK,KACP,OAAO,KAAK,KAOd,QAJI4B,EAAM,KAAK,MAAQ,IAAM,IACzBsG,EAAS,OAAO,KAAK,KAAK,KAAK,EAAE,KAAK,EACtClG,EAAMkG,EAAO,OAER,EAAI,EAAG,EAAIlG,EAAK,IAAK,CAC5B,IAAIO,EAAQ2F,EAAO,CAAC,EAChBL,EAAO,KAAK,MAAMtF,CAAK,EAE3BX,EAAMA,EAAMW,EAAQsF,EAAK,EAC3B,CAEA,OAAOjG,CACT,EAYA5B,EAAK,SAAS,UAAU,UAAY,SAAUqB,EAAG,CAU/C,QATIgD,EAAS,IAAIrE,EAAK,SAClBsH,EAAQ,OAERD,EAAQ,CAAC,CACX,MAAOhG,EACP,OAAQgD,EACR,KAAM,IACR,CAAC,EAEMgD,EAAM,QAAQ,CACnBC,EAAQD,EAAM,IAAI,EAWlB,QALIc,EAAS,OAAO,KAAKb,EAAM,MAAM,KAAK,EACtCc,EAAOD,EAAO,OACdE,EAAS,OAAO
,KAAKf,EAAM,KAAK,KAAK,EACrCgB,EAAOD,EAAO,OAETE,EAAI,EAAGA,EAAIH,EAAMG,IAGxB,QAFIC,EAAQL,EAAOI,CAAC,EAEXzH,EAAI,EAAGA,EAAIwH,EAAMxH,IAAK,CAC7B,IAAI2H,EAAQJ,EAAOvH,CAAC,EAEpB,GAAI2H,GAASD,GAASA,GAAS,IAAK,CAClC,IAAIX,EAAOP,EAAM,KAAK,MAAMmB,CAAK,EAC7BC,EAAQpB,EAAM,MAAM,MAAMkB,CAAK,EAC/BV,EAAQD,EAAK,OAASa,EAAM,MAC5BX,EAAO,OAEPU,KAASnB,EAAM,OAAO,OAIxBS,EAAOT,EAAM,OAAO,MAAMmB,CAAK,EAC/BV,EAAK,MAAQA,EAAK,OAASD,IAM3BC,EAAO,IAAI/H,EAAK,SAChB+H,EAAK,MAAQD,EACbR,EAAM,OAAO,MAAMmB,CAAK,EAAIV,GAG9BV,EAAM,KAAK,CACT,MAAOqB,EACP,OAAQX,EACR,KAAMF,CACR,CAAC,CACH,CACF,CAEJ,CAEA,OAAOxD,CACT,EACArE,EAAK,SAAS,QAAU,UAAY,CAClC,KAAK,aAAe,GACpB,KAAK,KAAO,IAAIA,EAAK,SACrB,KAAK,eAAiB,CAAC,EACvB,KAAK,eAAiB,CAAC,CACzB,EAEAA,EAAK,SAAS,QAAQ,UAAU,OAAS,SAAU2I,EAAM,CACvD,IAAId,EACAe,EAAe,EAEnB,GAAID,EAAO,KAAK,aACd,MAAM,IAAI,MAAO,6BAA6B,EAGhD,QAAS,EAAI,EAAG,EAAIA,EAAK,QAAU,EAAI,KAAK,aAAa,QACnDA,EAAK,CAAC,GAAK,KAAK,aAAa,CAAC,EAD6B,IAE/DC,IAGF,KAAK,SAASA,CAAY,EAEtB,KAAK,eAAe,QAAU,EAChCf,EAAO,KAAK,KAEZA,EAAO,KAAK,eAAe,KAAK,eAAe,OAAS,CAAC,EAAE,MAG7D,QAAS,EAAIe,EAAc,EAAID,EAAK,OAAQ,IAAK,CAC/C,IAAIE,EAAW,IAAI7I,EAAK,SACpBoC,EAAOuG,EAAK,CAAC,EAEjBd,EAAK,MAAMzF,CAAI,EAAIyG,EAEnB,KAAK,eAAe,KAAK,CACvB,OAAQhB,EACR,KAAMzF,EACN,MAAOyG,CACT,CAAC,EAEDhB,EAAOgB,CACT,CAEAhB,EAAK,MAAQ,GACb,KAAK,aAAec,CACtB,EAEA3I,EAAK,SAAS,QAAQ,UAAU,OAAS,UAAY,CACnD,KAAK,SAAS,CAAC,CACjB,EAEAA,EAAK,SAAS,QAAQ,UAAU,SAAW,SAAU8I,EAAQ,CAC3D,QAAS7H,EAAI,KAAK,eAAe,OAAS,EAAGA,GAAK6H,EAAQ7H,IAAK,CAC7D,IAAI4G,EAAO,KAAK,eAAe5G,CAAC,EAC5B8H,EAAWlB,EAAK,MAAM,SAAS,EAE/BkB,KAAY,KAAK,eACnBlB,EAAK,OAAO,MAAMA,EAAK,IAAI,EAAI,KAAK,eAAekB,CAAQ,GAI3DlB,EAAK,MAAM,KAAOkB,EAElB,KAAK,eAAeA,CAAQ,EAAIlB,EAAK,OAGvC,KAAK,eAAe,IAAI,CAC1B,CACF,EACA;AAAA;AAAA;AAAA,GAqBA7H,EAAK,MAAQ,SAAUgJ,EAAO,CAC5B,KAAK,cAAgBA,EAAM,cAC3B,KAAK,aAAeA,EAAM,aAC1B,KAAK,SAAWA,EAAM,SACtB,KAAK,OAASA,EAAM,OACpB,KAAK,SAAWA,EAAM,QACxB,EAyEAhJ,EAAK,MAAM,UAAU,OAAS,SAAUiJ,EAAa,CACnD,OAAO,KAAK,MAAM,SAAUC,EAAO,CACjC,IAAIC,EAAS,IAAInJ,EAAK,YAAYiJ,EAAaC,CAAK,EACpDC,EAAO,MAAM,CACf,CAAC,CACH,EA2BAnJ,EAAK,MA
AM,UAAU,MAAQ,SAAU8B,EAAI,CAoBzC,QAZIoH,EAAQ,IAAIlJ,EAAK,MAAM,KAAK,MAAM,EAClCoJ,EAAiB,OAAO,OAAO,IAAI,EACnCC,EAAe,OAAO,OAAO,IAAI,EACjCC,EAAiB,OAAO,OAAO,IAAI,EACnCC,EAAkB,OAAO,OAAO,IAAI,EACpCC,EAAoB,OAAO,OAAO,IAAI,EAOjCvI,EAAI,EAAGA,EAAI,KAAK,OAAO,OAAQA,IACtCoI,EAAa,KAAK,OAAOpI,CAAC,CAAC,EAAI,IAAIjB,EAAK,OAG1C8B,EAAG,KAAKoH,EAAOA,CAAK,EAEpB,QAASjI,EAAI,EAAGA,EAAIiI,EAAM,QAAQ,OAAQjI,IAAK,CAS7C,IAAIiG,EAASgC,EAAM,QAAQjI,CAAC,EACxBwI,EAAQ,KACRC,EAAgB1J,EAAK,IAAI,MAEzBkH,EAAO,YACTuC,EAAQ,KAAK,SAAS,UAAUvC,EAAO,KAAM,CAC3C,OAAQA,EAAO,MACjB,CAAC,EAEDuC,EAAQ,CAACvC,EAAO,IAAI,EAGtB,QAASyC,EAAI,EAAGA,EAAIF,EAAM,OAAQE,IAAK,CACrC,IAAIC,EAAOH,EAAME,CAAC,EAQlBzC,EAAO,KAAO0C,EAOd,IAAIC,EAAe7J,EAAK,SAAS,WAAWkH,CAAM,EAC9C4C,EAAgB,KAAK,SAAS,UAAUD,CAAY,EAAE,QAAQ,EAQlE,GAAIC,EAAc,SAAW,GAAK5C,EAAO,WAAalH,EAAK,MAAM,SAAS,SAAU,CAClF,QAASoD,EAAI,EAAGA,EAAI8D,EAAO,OAAO,OAAQ9D,IAAK,CAC7C,IAAI2G,EAAQ7C,EAAO,OAAO9D,CAAC,EAC3BmG,EAAgBQ,CAAK,EAAI/J,EAAK,IAAI,KACpC,CAEA,KACF,CAEA,QAASkD,EAAI,EAAGA,EAAI4G,EAAc,OAAQ5G,IASxC,QAJI8G,EAAeF,EAAc5G,CAAC,EAC9B1B,EAAU,KAAK,cAAcwI,CAAY,EACzCC,EAAYzI,EAAQ,OAEf4B,EAAI,EAAGA,EAAI8D,EAAO,OAAO,OAAQ9D,IAAK,CAS7C,IAAI2G,EAAQ7C,EAAO,OAAO9D,CAAC,EACvB8G,EAAe1I,EAAQuI,CAAK,EAC5BI,EAAuB,OAAO,KAAKD,CAAY,EAC/CE,EAAYJ,EAAe,IAAMD,EACjCM,EAAuB,IAAIrK,EAAK,IAAImK,CAAoB,EAoB5D,GAbIjD,EAAO,UAAYlH,EAAK,MAAM,SAAS,WACzC0J,EAAgBA,EAAc,MAAMW,CAAoB,EAEpDd,EAAgBQ,CAAK,IAAM,SAC7BR,EAAgBQ,CAAK,EAAI/J,EAAK,IAAI,WASlCkH,EAAO,UAAYlH,EAAK,MAAM,SAAS,WAAY,CACjDwJ,EAAkBO,CAAK,IAAM,SAC/BP,EAAkBO,CAAK,EAAI/J,EAAK,IAAI,OAGtCwJ,EAAkBO,CAAK,EAAIP,EAAkBO,CAAK,EAAE,MAAMM,CAAoB,EAO9E,QACF,CAeA,GANAhB,EAAaU,CAAK,EAAE,OAAOE,EAAW/C,EAAO,MAAO,SAAU9F,GAAGC,GAAG,CAAE,OAAOD,GAAIC,EAAE,CAAC,EAMhF,CAAAiI,EAAec,CAAS,EAI5B,SAASE,EAAI,EAAGA,EAAIH,EAAqB,OAAQG,IAAK,CAOpD,IAAIC,EAAsBJ,EAAqBG,CAAC,EAC5CE,EAAmB,IAAIxK,EAAK,SAAUuK,EAAqBR,CAAK,EAChElI,EAAWqI,EAAaK,CAAmB,EAC3CE,GAECA,EAAarB,EAAeoB,CAAgB,KAAO,OACtDpB,EAAeoB,CAAgB,EAAI,IAAIxK,EAAK,UAAWgK,EAAcD,EAAOlI,CAAQ,EAEpF4I,EAAW,IAAIT,EAAcD,EAAOlI,CAAQ,CAGhD,CAEAyH,EAA
ec,CAAS,EAAI,GAC9B,CAEJ,CAQA,GAAIlD,EAAO,WAAalH,EAAK,MAAM,SAAS,SAC1C,QAASoD,EAAI,EAAGA,EAAI8D,EAAO,OAAO,OAAQ9D,IAAK,CAC7C,IAAI2G,EAAQ7C,EAAO,OAAO9D,CAAC,EAC3BmG,EAAgBQ,CAAK,EAAIR,EAAgBQ,CAAK,EAAE,UAAUL,CAAa,CACzE,CAEJ,CAUA,QAHIgB,EAAqB1K,EAAK,IAAI,SAC9B2K,EAAuB3K,EAAK,IAAI,MAE3BiB,EAAI,EAAGA,EAAI,KAAK,OAAO,OAAQA,IAAK,CAC3C,IAAI8I,EAAQ,KAAK,OAAO9I,CAAC,EAErBsI,EAAgBQ,CAAK,IACvBW,EAAqBA,EAAmB,UAAUnB,EAAgBQ,CAAK,CAAC,GAGtEP,EAAkBO,CAAK,IACzBY,EAAuBA,EAAqB,MAAMnB,EAAkBO,CAAK,CAAC,EAE9E,CAEA,IAAIa,EAAoB,OAAO,KAAKxB,CAAc,EAC9CyB,EAAU,CAAC,EACXC,EAAU,OAAO,OAAO,IAAI,EAYhC,GAAI5B,EAAM,UAAU,EAAG,CACrB0B,EAAoB,OAAO,KAAK,KAAK,YAAY,EAEjD,QAAS3J,EAAI,EAAGA,EAAI2J,EAAkB,OAAQ3J,IAAK,CACjD,IAAIuJ,EAAmBI,EAAkB3J,CAAC,EACtCF,EAAWf,EAAK,SAAS,WAAWwK,CAAgB,EACxDpB,EAAeoB,CAAgB,EAAI,IAAIxK,EAAK,SAC9C,CACF,CAEA,QAASiB,EAAI,EAAGA,EAAI2J,EAAkB,OAAQ3J,IAAK,CASjD,IAAIF,EAAWf,EAAK,SAAS,WAAW4K,EAAkB3J,CAAC,CAAC,EACxDP,EAASK,EAAS,OAEtB,GAAK2J,EAAmB,SAAShK,CAAM,GAInC,CAAAiK,EAAqB,SAASjK,CAAM,EAIxC,KAAIqK,EAAc,KAAK,aAAahK,CAAQ,EACxCiK,EAAQ3B,EAAatI,EAAS,SAAS,EAAE,WAAWgK,CAAW,EAC/DE,EAEJ,IAAKA,EAAWH,EAAQpK,CAAM,KAAO,OACnCuK,EAAS,OAASD,EAClBC,EAAS,UAAU,QAAQ7B,EAAerI,CAAQ,CAAC,MAC9C,CACL,IAAImK,EAAQ,CACV,IAAKxK,EACL,MAAOsK,EACP,UAAW5B,EAAerI,CAAQ,CACpC,EACA+J,EAAQpK,CAAM,EAAIwK,EAClBL,EAAQ,KAAKK,CAAK,CACpB,EACF,CAKA,OAAOL,EAAQ,KAAK,SAAUzJ,GAAGC,GAAG,CAClC,OAAOA,GAAE,MAAQD,GAAE,KACrB,CAAC,CACH,EAUApB,EAAK,MAAM,UAAU,OAAS,UAAY,CACxC,IAAImL,EAAgB,OAAO,KAAK,KAAK,aAAa,EAC/C,KAAK,EACL,IAAI,SAAUvB,EAAM,CACnB,MAAO,CAACA,EAAM,KAAK,cAAcA,CAAI,CAAC,CACxC,EAAG,IAAI,EAELwB,EAAe,OAAO,KAAK,KAAK,YAAY,EAC7C,IAAI,SAAUC,EAAK,CAClB,MAAO,CAACA,EAAK,KAAK,aAAaA,CAAG,EAAE,OAAO,CAAC,CAC9C,EAAG,IAAI,EAET,MAAO,CACL,QAASrL,EAAK,QACd,OAAQ,KAAK,OACb,aAAcoL,EACd,cAAeD,EACf,SAAU,KAAK,SAAS,OAAO,CACjC,CACF,EAQAnL,EAAK,MAAM,KAAO,SAAUsL,EAAiB,CAC3C,IAAItC,EAAQ,CAAC,EACToC,EAAe,CAAC,EAChBG,EAAoBD,EAAgB,aACpCH,EAAgB,OAAO,OAAO,IAAI,EAClCK,EAA0BF,EAAgB,cAC1CG,EAAkB,IAAIzL,EAAK,SAAS,QACpC0C,EAAW1C,EAAK,SAAS,KAAKsL,EAAgB,QAAQ,EAEtDA,EAAgB,SAAW
tL,EAAK,SAClCA,EAAK,MAAM,KAAK,4EAA8EA,EAAK,QAAU,sCAAwCsL,EAAgB,QAAU,GAAG,EAGpL,QAASrK,EAAI,EAAGA,EAAIsK,EAAkB,OAAQtK,IAAK,CACjD,IAAIyK,EAAQH,EAAkBtK,CAAC,EAC3BoK,EAAMK,EAAM,CAAC,EACb1K,EAAW0K,EAAM,CAAC,EAEtBN,EAAaC,CAAG,EAAI,IAAIrL,EAAK,OAAOgB,CAAQ,CAC9C,CAEA,QAASC,EAAI,EAAGA,EAAIuK,EAAwB,OAAQvK,IAAK,CACvD,IAAIyK,EAAQF,EAAwBvK,CAAC,EACjC2I,EAAO8B,EAAM,CAAC,EACdlK,EAAUkK,EAAM,CAAC,EAErBD,EAAgB,OAAO7B,CAAI,EAC3BuB,EAAcvB,CAAI,EAAIpI,CACxB,CAEA,OAAAiK,EAAgB,OAAO,EAEvBzC,EAAM,OAASsC,EAAgB,OAE/BtC,EAAM,aAAeoC,EACrBpC,EAAM,cAAgBmC,EACtBnC,EAAM,SAAWyC,EAAgB,KACjCzC,EAAM,SAAWtG,EAEV,IAAI1C,EAAK,MAAMgJ,CAAK,CAC7B,EACA;AAAA;AAAA;AAAA,GA6BAhJ,EAAK,QAAU,UAAY,CACzB,KAAK,KAAO,KACZ,KAAK,QAAU,OAAO,OAAO,IAAI,EACjC,KAAK,WAAa,OAAO,OAAO,IAAI,EACpC,KAAK,cAAgB,OAAO,OAAO,IAAI,EACvC,KAAK,qBAAuB,CAAC,EAC7B,KAAK,aAAe,CAAC,EACrB,KAAK,UAAYA,EAAK,UACtB,KAAK,SAAW,IAAIA,EAAK,SACzB,KAAK,eAAiB,IAAIA,EAAK,SAC/B,KAAK,cAAgB,EACrB,KAAK,GAAK,IACV,KAAK,IAAM,IACX,KAAK,UAAY,EACjB,KAAK,kBAAoB,CAAC,CAC5B,EAcAA,EAAK,QAAQ,UAAU,IAAM,SAAUqL,EAAK,CAC1C,KAAK,KAAOA,CACd,EAkCArL,EAAK,QAAQ,UAAU,MAAQ,SAAUW,EAAWgL,EAAY,CAC9D,GAAI,KAAK,KAAKhL,CAAS,EACrB,MAAM,IAAI,WAAY,UAAYA,EAAY,kCAAkC,EAGlF,KAAK,QAAQA,CAAS,EAAIgL,GAAc,CAAC,CAC3C,EAUA3L,EAAK,QAAQ,UAAU,EAAI,SAAU4L,EAAQ,CACvCA,EAAS,EACX,KAAK,GAAK,EACDA,EAAS,EAClB,KAAK,GAAK,EAEV,KAAK,GAAKA,CAEd,EASA5L,EAAK,QAAQ,UAAU,GAAK,SAAU4L,EAAQ,CAC5C,KAAK,IAAMA,CACb,EAmBA5L,EAAK,QAAQ,UAAU,IAAM,SAAU6L,EAAKF,EAAY,CACtD,IAAIjL,EAASmL,EAAI,KAAK,IAAI,EACtBC,EAAS,OAAO,KAAK,KAAK,OAAO,EAErC,KAAK,WAAWpL,CAAM,EAAIiL,GAAc,CAAC,EACzC,KAAK,eAAiB,EAEtB,QAAS1K,EAAI,EAAGA,EAAI6K,EAAO,OAAQ7K,IAAK,CACtC,IAAIN,EAAYmL,EAAO7K,CAAC,EACpB8K,EAAY,KAAK,QAAQpL,CAAS,EAAE,UACpCoJ,EAAQgC,EAAYA,EAAUF,CAAG,EAAIA,EAAIlL,CAAS,EAClDsB,EAAS,KAAK,UAAU8H,EAAO,CAC7B,OAAQ,CAACpJ,CAAS,CACpB,CAAC,EACD8I,EAAQ,KAAK,SAAS,IAAIxH,CAAM,EAChClB,EAAW,IAAIf,EAAK,SAAUU,EAAQC,CAAS,EAC/CqL,EAAa,OAAO,OAAO,IAAI,EAEnC,KAAK,qBAAqBjL,CAAQ,EAAIiL,EACtC,KAAK,aAAajL,CAAQ,EAAI,EAG9B,KAAK,aAAaA,CAAQ,GAAK0I,EAAM,OAGrC,QAASvG,EAAI,EAAGA,EAAIuG,EAAM
,OAAQvG,IAAK,CACrC,IAAI0G,EAAOH,EAAMvG,CAAC,EAUlB,GARI8I,EAAWpC,CAAI,GAAK,OACtBoC,EAAWpC,CAAI,EAAI,GAGrBoC,EAAWpC,CAAI,GAAK,EAIhB,KAAK,cAAcA,CAAI,GAAK,KAAW,CACzC,IAAIpI,EAAU,OAAO,OAAO,IAAI,EAChCA,EAAQ,OAAY,KAAK,UACzB,KAAK,WAAa,EAElB,QAAS4B,EAAI,EAAGA,EAAI0I,EAAO,OAAQ1I,IACjC5B,EAAQsK,EAAO1I,CAAC,CAAC,EAAI,OAAO,OAAO,IAAI,EAGzC,KAAK,cAAcwG,CAAI,EAAIpI,CAC7B,CAGI,KAAK,cAAcoI,CAAI,EAAEjJ,CAAS,EAAED,CAAM,GAAK,OACjD,KAAK,cAAckJ,CAAI,EAAEjJ,CAAS,EAAED,CAAM,EAAI,OAAO,OAAO,IAAI,GAKlE,QAAS4J,EAAI,EAAGA,EAAI,KAAK,kBAAkB,OAAQA,IAAK,CACtD,IAAI2B,EAAc,KAAK,kBAAkB3B,CAAC,EACtCzI,EAAW+H,EAAK,SAASqC,CAAW,EAEpC,KAAK,cAAcrC,CAAI,EAAEjJ,CAAS,EAAED,CAAM,EAAEuL,CAAW,GAAK,OAC9D,KAAK,cAAcrC,CAAI,EAAEjJ,CAAS,EAAED,CAAM,EAAEuL,CAAW,EAAI,CAAC,GAG9D,KAAK,cAAcrC,CAAI,EAAEjJ,CAAS,EAAED,CAAM,EAAEuL,CAAW,EAAE,KAAKpK,CAAQ,CACxE,CACF,CAEF,CACF,EAOA7B,EAAK,QAAQ,UAAU,6BAA+B,UAAY,CAOhE,QALIkM,EAAY,OAAO,KAAK,KAAK,YAAY,EACzCC,EAAiBD,EAAU,OAC3BE,EAAc,CAAC,EACfC,EAAqB,CAAC,EAEjBpL,EAAI,EAAGA,EAAIkL,EAAgBlL,IAAK,CACvC,IAAIF,EAAWf,EAAK,SAAS,WAAWkM,EAAUjL,CAAC,CAAC,EAChD8I,EAAQhJ,EAAS,UAErBsL,EAAmBtC,CAAK,IAAMsC,EAAmBtC,CAAK,EAAI,GAC1DsC,EAAmBtC,CAAK,GAAK,EAE7BqC,EAAYrC,CAAK,IAAMqC,EAAYrC,CAAK,EAAI,GAC5CqC,EAAYrC,CAAK,GAAK,KAAK,aAAahJ,CAAQ,CAClD,CAIA,QAFI+K,EAAS,OAAO,KAAK,KAAK,OAAO,EAE5B7K,EAAI,EAAGA,EAAI6K,EAAO,OAAQ7K,IAAK,CACtC,IAAIN,EAAYmL,EAAO7K,CAAC,EACxBmL,EAAYzL,CAAS,EAAIyL,EAAYzL,CAAS,EAAI0L,EAAmB1L,CAAS,CAChF,CAEA,KAAK,mBAAqByL,CAC5B,EAOApM,EAAK,QAAQ,UAAU,mBAAqB,UAAY,CAMtD,QALIoL,EAAe,CAAC,EAChBc,EAAY,OAAO,KAAK,KAAK,oBAAoB,EACjDI,EAAkBJ,EAAU,OAC5BK,EAAe,OAAO,OAAO,IAAI,EAE5BtL,EAAI,EAAGA,EAAIqL,EAAiBrL,IAAK,CAaxC,QAZIF,EAAWf,EAAK,SAAS,WAAWkM,EAAUjL,CAAC,CAAC,EAChDN,EAAYI,EAAS,UACrByL,EAAc,KAAK,aAAazL,CAAQ,EACxCgK,EAAc,IAAI/K,EAAK,OACvByM,EAAkB,KAAK,qBAAqB1L,CAAQ,EACpD0I,EAAQ,OAAO,KAAKgD,CAAe,EACnCC,EAAcjD,EAAM,OAGpBkD,EAAa,KAAK,QAAQhM,CAAS,EAAE,OAAS,EAC9CiM,EAAW,KAAK,WAAW7L,EAAS,MAAM,EAAE,OAAS,EAEhDmC,EAAI,EAAGA,EAAIwJ,EAAaxJ,IAAK,CACpC,IAAI0G,EAAOH,EAAMvG,CAAC,EACd2J,EAAKJ,EAAgB7C,CAAI,EACzBK,EAAY,KA
AK,cAAcL,CAAI,EAAE,OACrCkD,EAAK9B,EAAO+B,EAEZR,EAAa3C,CAAI,IAAM,QACzBkD,EAAM9M,EAAK,IAAI,KAAK,cAAc4J,CAAI,EAAG,KAAK,aAAa,EAC3D2C,EAAa3C,CAAI,EAAIkD,GAErBA,EAAMP,EAAa3C,CAAI,EAGzBoB,EAAQ8B,IAAQ,KAAK,IAAM,GAAKD,IAAO,KAAK,KAAO,EAAI,KAAK,GAAK,KAAK,IAAML,EAAc,KAAK,mBAAmB7L,CAAS,IAAMkM,GACjI7B,GAAS2B,EACT3B,GAAS4B,EACTG,EAAqB,KAAK,MAAM/B,EAAQ,GAAI,EAAI,IAQhDD,EAAY,OAAOd,EAAW8C,CAAkB,CAClD,CAEA3B,EAAarK,CAAQ,EAAIgK,CAC3B,CAEA,KAAK,aAAeK,CACtB,EAOApL,EAAK,QAAQ,UAAU,eAAiB,UAAY,CAClD,KAAK,SAAWA,EAAK,SAAS,UAC5B,OAAO,KAAK,KAAK,aAAa,EAAE,KAAK,CACvC,CACF,EAUAA,EAAK,QAAQ,UAAU,MAAQ,UAAY,CACzC,YAAK,6BAA6B,EAClC,KAAK,mBAAmB,EACxB,KAAK,eAAe,EAEb,IAAIA,EAAK,MAAM,CACpB,cAAe,KAAK,cACpB,aAAc,KAAK,aACnB,SAAU,KAAK,SACf,OAAQ,OAAO,KAAK,KAAK,OAAO,EAChC,SAAU,KAAK,cACjB,CAAC,CACH,EAgBAA,EAAK,QAAQ,UAAU,IAAM,SAAU8B,EAAI,CACzC,IAAIkL,EAAO,MAAM,UAAU,MAAM,KAAK,UAAW,CAAC,EAClDA,EAAK,QAAQ,IAAI,EACjBlL,EAAG,MAAM,KAAMkL,CAAI,CACrB,EAaAhN,EAAK,UAAY,SAAU4J,EAAMG,EAAOlI,EAAU,CAShD,QARIoL,EAAiB,OAAO,OAAO,IAAI,EACnCC,EAAe,OAAO,KAAKrL,GAAY,CAAC,CAAC,EAOpCZ,EAAI,EAAGA,EAAIiM,EAAa,OAAQjM,IAAK,CAC5C,IAAIT,EAAM0M,EAAajM,CAAC,EACxBgM,EAAezM,CAAG,EAAIqB,EAASrB,CAAG,EAAE,MAAM,CAC5C,CAEA,KAAK,SAAW,OAAO,OAAO,IAAI,EAE9BoJ,IAAS,SACX,KAAK,SAASA,CAAI,EAAI,OAAO,OAAO,IAAI,EACxC,KAAK,SAASA,CAAI,EAAEG,CAAK,EAAIkD,EAEjC,EAWAjN,EAAK,UAAU,UAAU,QAAU,SAAUmN,EAAgB,CAG3D,QAFI1D,EAAQ,OAAO,KAAK0D,EAAe,QAAQ,EAEtClM,EAAI,EAAGA,EAAIwI,EAAM,OAAQxI,IAAK,CACrC,IAAI2I,EAAOH,EAAMxI,CAAC,EACd6K,EAAS,OAAO,KAAKqB,EAAe,SAASvD,CAAI,CAAC,EAElD,KAAK,SAASA,CAAI,GAAK,OACzB,KAAK,SAASA,CAAI,EAAI,OAAO,OAAO,IAAI,GAG1C,QAAS1G,EAAI,EAAGA,EAAI4I,EAAO,OAAQ5I,IAAK,CACtC,IAAI6G,EAAQ+B,EAAO5I,CAAC,EAChB3C,EAAO,OAAO,KAAK4M,EAAe,SAASvD,CAAI,EAAEG,CAAK,CAAC,EAEvD,KAAK,SAASH,CAAI,EAAEG,CAAK,GAAK,OAChC,KAAK,SAASH,CAAI,EAAEG,CAAK,EAAI,OAAO,OAAO,IAAI,GAGjD,QAAS3G,EAAI,EAAGA,EAAI7C,EAAK,OAAQ6C,IAAK,CACpC,IAAI5C,EAAMD,EAAK6C,CAAC,EAEZ,KAAK,SAASwG,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,GAAK,KACrC,KAAK,SAASoJ,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,EAAI2M,EAAe,SAASvD,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,E
AE1E,KAAK,SAASoJ,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,EAAI,KAAK,SAASoJ,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,EAAE,OAAO2M,EAAe,SAASvD,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,CAAC,CAGtH,CACF,CACF,CACF,EASAR,EAAK,UAAU,UAAU,IAAM,SAAU4J,EAAMG,EAAOlI,EAAU,CAC9D,GAAI,EAAE+H,KAAQ,KAAK,UAAW,CAC5B,KAAK,SAASA,CAAI,EAAI,OAAO,OAAO,IAAI,EACxC,KAAK,SAASA,CAAI,EAAEG,CAAK,EAAIlI,EAC7B,MACF,CAEA,GAAI,EAAEkI,KAAS,KAAK,SAASH,CAAI,GAAI,CACnC,KAAK,SAASA,CAAI,EAAEG,CAAK,EAAIlI,EAC7B,MACF,CAIA,QAFIqL,EAAe,OAAO,KAAKrL,CAAQ,EAE9BZ,EAAI,EAAGA,EAAIiM,EAAa,OAAQjM,IAAK,CAC5C,IAAIT,EAAM0M,EAAajM,CAAC,EAEpBT,KAAO,KAAK,SAASoJ,CAAI,EAAEG,CAAK,EAClC,KAAK,SAASH,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,EAAI,KAAK,SAASoJ,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,EAAE,OAAOqB,EAASrB,CAAG,CAAC,EAEtF,KAAK,SAASoJ,CAAI,EAAEG,CAAK,EAAEvJ,CAAG,EAAIqB,EAASrB,CAAG,CAElD,CACF,EAYAR,EAAK,MAAQ,SAAUoN,EAAW,CAChC,KAAK,QAAU,CAAC,EAChB,KAAK,UAAYA,CACnB,EA0BApN,EAAK,MAAM,SAAW,IAAI,OAAQ,GAAG,EACrCA,EAAK,MAAM,SAAS,KAAO,EAC3BA,EAAK,MAAM,SAAS,QAAU,EAC9BA,EAAK,MAAM,SAAS,SAAW,EAa/BA,EAAK,MAAM,SAAW,CAIpB,SAAU,EAMV,SAAU,EAMV,WAAY,CACd,EAyBAA,EAAK,MAAM,UAAU,OAAS,SAAUkH,EAAQ,CAC9C,MAAM,WAAYA,IAChBA,EAAO,OAAS,KAAK,WAGjB,UAAWA,IACfA,EAAO,MAAQ,GAGX,gBAAiBA,IACrBA,EAAO,YAAc,IAGjB,aAAcA,IAClBA,EAAO,SAAWlH,EAAK,MAAM,SAAS,MAGnCkH,EAAO,SAAWlH,EAAK,MAAM,SAAS,SAAakH,EAAO,KAAK,OAAO,CAAC,GAAKlH,EAAK,MAAM,WAC1FkH,EAAO,KAAO,IAAMA,EAAO,MAGxBA,EAAO,SAAWlH,EAAK,MAAM,SAAS,UAAckH,EAAO,KAAK,MAAM,EAAE,GAAKlH,EAAK,MAAM,WAC3FkH,EAAO,KAAO,GAAKA,EAAO,KAAO,KAG7B,aAAcA,IAClBA,EAAO,SAAWlH,EAAK,MAAM,SAAS,UAGxC,KAAK,QAAQ,KAAKkH,CAAM,EAEjB,IACT,EASAlH,EAAK,MAAM,UAAU,UAAY,UAAY,CAC3C,QAASiB,EAAI,EAAGA,EAAI,KAAK,QAAQ,OAAQA,IACvC,GAAI,KAAK,QAAQA,CAAC,EAAE,UAAYjB,EAAK,MAAM,SAAS,WAClD,MAAO,GAIX,MAAO,EACT,EA4BAA,EAAK,MAAM,UAAU,KAAO,SAAU4J,EAAMyD,EAAS,CACnD,GAAI,MAAM,QAAQzD,CAAI,EACpB,OAAAA,EAAK,QAAQ,SAAU7H,EAAG,CAAE,KAAK,KAAKA,EAAG/B,EAAK,MAAM,MAAMqN,CAAO,CAAC,CAAE,EAAG,IAAI,EACpE,KAGT,IAAInG,EAASmG,GAAW,CAAC,EACzB,OAAAnG,EAAO,KAAO0C,EAAK,SAAS,EAE5B,KAAK,OAAO1C,CAAM,EAEX,IACT,EACAlH,EAAK,gBAAkB,SAAUI,EAASmD,EAAOC,EAAK,CACpD,K
AAK,KAAO,kBACZ,KAAK,QAAUpD,EACf,KAAK,MAAQmD,EACb,KAAK,IAAMC,CACb,EAEAxD,EAAK,gBAAgB,UAAY,IAAI,MACrCA,EAAK,WAAa,SAAU4B,EAAK,CAC/B,KAAK,QAAU,CAAC,EAChB,KAAK,IAAMA,EACX,KAAK,OAASA,EAAI,OAClB,KAAK,IAAM,EACX,KAAK,MAAQ,EACb,KAAK,oBAAsB,CAAC,CAC9B,EAEA5B,EAAK,WAAW,UAAU,IAAM,UAAY,CAG1C,QAFIsN,EAAQtN,EAAK,WAAW,QAErBsN,GACLA,EAAQA,EAAM,IAAI,CAEtB,EAEAtN,EAAK,WAAW,UAAU,YAAc,UAAY,CAKlD,QAJIuN,EAAY,CAAC,EACbpL,EAAa,KAAK,MAClBD,EAAW,KAAK,IAEX,EAAI,EAAG,EAAI,KAAK,oBAAoB,OAAQ,IACnDA,EAAW,KAAK,oBAAoB,CAAC,EACrCqL,EAAU,KAAK,KAAK,IAAI,MAAMpL,EAAYD,CAAQ,CAAC,EACnDC,EAAaD,EAAW,EAG1B,OAAAqL,EAAU,KAAK,KAAK,IAAI,MAAMpL,EAAY,KAAK,GAAG,CAAC,EACnD,KAAK,oBAAoB,OAAS,EAE3BoL,EAAU,KAAK,EAAE,CAC1B,EAEAvN,EAAK,WAAW,UAAU,KAAO,SAAUwN,EAAM,CAC/C,KAAK,QAAQ,KAAK,CAChB,KAAMA,EACN,IAAK,KAAK,YAAY,EACtB,MAAO,KAAK,MACZ,IAAK,KAAK,GACZ,CAAC,EAED,KAAK,MAAQ,KAAK,GACpB,EAEAxN,EAAK,WAAW,UAAU,gBAAkB,UAAY,CACtD,KAAK,oBAAoB,KAAK,KAAK,IAAM,CAAC,EAC1C,KAAK,KAAO,CACd,EAEAA,EAAK,WAAW,UAAU,KAAO,UAAY,CAC3C,GAAI,KAAK,KAAO,KAAK,OACnB,OAAOA,EAAK,WAAW,IAGzB,IAAIoC,EAAO,KAAK,IAAI,OAAO,KAAK,GAAG,EACnC,YAAK,KAAO,EACLA,CACT,EAEApC,EAAK,WAAW,UAAU,MAAQ,UAAY,CAC5C,OAAO,KAAK,IAAM,KAAK,KACzB,EAEAA,EAAK,WAAW,UAAU,OAAS,UAAY,CACzC,KAAK,OAAS,KAAK,MACrB,KAAK,KAAO,GAGd,KAAK,MAAQ,KAAK,GACpB,EAEAA,EAAK,WAAW,UAAU,OAAS,UAAY,CAC7C,KAAK,KAAO,CACd,EAEAA,EAAK,WAAW,UAAU,eAAiB,UAAY,CACrD,IAAIoC,EAAMqL,EAEV,GACErL,EAAO,KAAK,KAAK,EACjBqL,EAAWrL,EAAK,WAAW,CAAC,QACrBqL,EAAW,IAAMA,EAAW,IAEjCrL,GAAQpC,EAAK,WAAW,KAC1B,KAAK,OAAO,CAEhB,EAEAA,EAAK,WAAW,UAAU,KAAO,UAAY,CAC3C,OAAO,KAAK,IAAM,KAAK,MACzB,EAEAA,EAAK,WAAW,IAAM,MACtBA,EAAK,WAAW,MAAQ,QACxBA,EAAK,WAAW,KAAO,OACvBA,EAAK,WAAW,cAAgB,gBAChCA,EAAK,WAAW,MAAQ,QACxBA,EAAK,WAAW,SAAW,WAE3BA,EAAK,WAAW,SAAW,SAAU0N,EAAO,CAC1C,OAAAA,EAAM,OAAO,EACbA,EAAM,KAAK1N,EAAK,WAAW,KAAK,EAChC0N,EAAM,OAAO,EACN1N,EAAK,WAAW,OACzB,EAEAA,EAAK,WAAW,QAAU,SAAU0N,EAAO,CAQzC,GAPIA,EAAM,MAAM,EAAI,IAClBA,EAAM,OAAO,EACbA,EAAM,KAAK1N,EAAK,WAAW,IAAI,GAGjC0N,EAAM,OAAO,EAETA,EAAM,KAAK,EACb,OAAO1N,EAAK,WAAW,OAE3B,EAEAA,EAAK,WAAW,gBAAkB,SAAU0N,EAAO,C
ACjD,OAAAA,EAAM,OAAO,EACbA,EAAM,eAAe,EACrBA,EAAM,KAAK1N,EAAK,WAAW,aAAa,EACjCA,EAAK,WAAW,OACzB,EAEAA,EAAK,WAAW,SAAW,SAAU0N,EAAO,CAC1C,OAAAA,EAAM,OAAO,EACbA,EAAM,eAAe,EACrBA,EAAM,KAAK1N,EAAK,WAAW,KAAK,EACzBA,EAAK,WAAW,OACzB,EAEAA,EAAK,WAAW,OAAS,SAAU0N,EAAO,CACpCA,EAAM,MAAM,EAAI,GAClBA,EAAM,KAAK1N,EAAK,WAAW,IAAI,CAEnC,EAaAA,EAAK,WAAW,cAAgBA,EAAK,UAAU,UAE/CA,EAAK,WAAW,QAAU,SAAU0N,EAAO,CACzC,OAAa,CACX,IAAItL,EAAOsL,EAAM,KAAK,EAEtB,GAAItL,GAAQpC,EAAK,WAAW,IAC1B,OAAOA,EAAK,WAAW,OAIzB,GAAIoC,EAAK,WAAW,CAAC,GAAK,GAAI,CAC5BsL,EAAM,gBAAgB,EACtB,QACF,CAEA,GAAItL,GAAQ,IACV,OAAOpC,EAAK,WAAW,SAGzB,GAAIoC,GAAQ,IACV,OAAAsL,EAAM,OAAO,EACTA,EAAM,MAAM,EAAI,GAClBA,EAAM,KAAK1N,EAAK,WAAW,IAAI,EAE1BA,EAAK,WAAW,gBAGzB,GAAIoC,GAAQ,IACV,OAAAsL,EAAM,OAAO,EACTA,EAAM,MAAM,EAAI,GAClBA,EAAM,KAAK1N,EAAK,WAAW,IAAI,EAE1BA,EAAK,WAAW,SAczB,GARIoC,GAAQ,KAAOsL,EAAM,MAAM,IAAM,GAQjCtL,GAAQ,KAAOsL,EAAM,MAAM,IAAM,EACnC,OAAAA,EAAM,KAAK1N,EAAK,WAAW,QAAQ,EAC5BA,EAAK,WAAW,QAGzB,GAAIoC,EAAK,MAAMpC,EAAK,WAAW,aAAa,EAC1C,OAAOA,EAAK,WAAW,OAE3B,CACF,EAEAA,EAAK,YAAc,SAAU4B,EAAKsH,EAAO,CACvC,KAAK,MAAQ,IAAIlJ,EAAK,WAAY4B,CAAG,EACrC,KAAK,MAAQsH,EACb,KAAK,cAAgB,CAAC,EACtB,KAAK,UAAY,CACnB,EAEAlJ,EAAK,YAAY,UAAU,MAAQ,UAAY,CAC7C,KAAK,MAAM,IAAI,EACf,KAAK,QAAU,KAAK,MAAM,QAI1B,QAFIsN,EAAQtN,EAAK,YAAY,YAEtBsN,GACLA,EAAQA,EAAM,IAAI,EAGpB,OAAO,KAAK,KACd,EAEAtN,EAAK,YAAY,UAAU,WAAa,UAAY,CAClD,OAAO,KAAK,QAAQ,KAAK,SAAS,CACpC,EAEAA,EAAK,YAAY,UAAU,cAAgB,UAAY,CACrD,IAAI2N,EAAS,KAAK,WAAW,EAC7B,YAAK,WAAa,EACXA,CACT,EAEA3N,EAAK,YAAY,UAAU,WAAa,UAAY,CAClD,IAAI4N,EAAkB,KAAK,cAC3B,KAAK,MAAM,OAAOA,CAAe,EACjC,KAAK,cAAgB,CAAC,CACxB,EAEA5N,EAAK,YAAY,YAAc,SAAUmJ,EAAQ,CAC/C,IAAIwE,EAASxE,EAAO,WAAW,EAE/B,GAAIwE,GAAU,KAId,OAAQA,EAAO,KAAM,CACnB,KAAK3N,EAAK,WAAW,SACnB,OAAOA,EAAK,YAAY,cAC1B,KAAKA,EAAK,WAAW,MACnB,OAAOA,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,KACnB,OAAOA,EAAK,YAAY,UAC1B,QACE,IAAI6N,EAAe,4CAA8CF,EAAO,KAExE,MAAIA,EAAO,IAAI,QAAU,IACvBE,GAAgB,gBAAkBF,EAAO,IAAM,KAG3C,IAAI3N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CAC1E,CACF,EAEA3N,EAAK,YAAY,cAAgB,SAAUmJ,
EAAQ,CACjD,IAAIwE,EAASxE,EAAO,cAAc,EAElC,GAAIwE,GAAU,KAId,QAAQA,EAAO,IAAK,CAClB,IAAK,IACHxE,EAAO,cAAc,SAAWnJ,EAAK,MAAM,SAAS,WACpD,MACF,IAAK,IACHmJ,EAAO,cAAc,SAAWnJ,EAAK,MAAM,SAAS,SACpD,MACF,QACE,IAAI6N,EAAe,kCAAoCF,EAAO,IAAM,IACpE,MAAM,IAAI3N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CAC1E,CAEA,IAAIG,EAAa3E,EAAO,WAAW,EAEnC,GAAI2E,GAAc,KAAW,CAC3B,IAAID,EAAe,yCACnB,MAAM,IAAI7N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CACxE,CAEA,OAAQG,EAAW,KAAM,CACvB,KAAK9N,EAAK,WAAW,MACnB,OAAOA,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,KACnB,OAAOA,EAAK,YAAY,UAC1B,QACE,IAAI6N,EAAe,mCAAqCC,EAAW,KAAO,IAC1E,MAAM,IAAI9N,EAAK,gBAAiB6N,EAAcC,EAAW,MAAOA,EAAW,GAAG,CAClF,EACF,EAEA9N,EAAK,YAAY,WAAa,SAAUmJ,EAAQ,CAC9C,IAAIwE,EAASxE,EAAO,cAAc,EAElC,GAAIwE,GAAU,KAId,IAAIxE,EAAO,MAAM,UAAU,QAAQwE,EAAO,GAAG,GAAK,GAAI,CACpD,IAAII,EAAiB5E,EAAO,MAAM,UAAU,IAAI,SAAU6E,EAAG,CAAE,MAAO,IAAMA,EAAI,GAAI,CAAC,EAAE,KAAK,IAAI,EAC5FH,EAAe,uBAAyBF,EAAO,IAAM,uBAAyBI,EAElF,MAAM,IAAI/N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CACxE,CAEAxE,EAAO,cAAc,OAAS,CAACwE,EAAO,GAAG,EAEzC,IAAIG,EAAa3E,EAAO,WAAW,EAEnC,GAAI2E,GAAc,KAAW,CAC3B,IAAID,EAAe,gCACnB,MAAM,IAAI7N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CACxE,CAEA,OAAQG,EAAW,KAAM,CACvB,KAAK9N,EAAK,WAAW,KACnB,OAAOA,EAAK,YAAY,UAC1B,QACE,IAAI6N,EAAe,0BAA4BC,EAAW,KAAO,IACjE,MAAM,IAAI9N,EAAK,gBAAiB6N,EAAcC,EAAW,MAAOA,EAAW,GAAG,CAClF,EACF,EAEA9N,EAAK,YAAY,UAAY,SAAUmJ,EAAQ,CAC7C,IAAIwE,EAASxE,EAAO,cAAc,EAElC,GAAIwE,GAAU,KAId,CAAAxE,EAAO,cAAc,KAAOwE,EAAO,IAAI,YAAY,EAE/CA,EAAO,IAAI,QAAQ,GAAG,GAAK,KAC7BxE,EAAO,cAAc,YAAc,IAGrC,IAAI2E,EAAa3E,EAAO,WAAW,EAEnC,GAAI2E,GAAc,KAAW,CAC3B3E,EAAO,WAAW,EAClB,MACF,CAEA,OAAQ2E,EAAW,KAAM,CACvB,KAAK9N,EAAK,WAAW,KACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,UAC1B,KAAKA,EAAK,WAAW,MACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,cACnB,OAAOA,EAAK,YAAY,kBAC1B,KAAKA,EAAK,WAAW,MACnB,OAAOA,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,SACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,cAC1B,QACE,IAAI6N,EAAe,2BAA6BC,EAAW,KAAO,IAClE,MAAM,IAAI9N,EAAK,gBAAiB6N,EAAcC,EAAW,MAAOA,EAAW,GAAG,CAClF,
EACF,EAEA9N,EAAK,YAAY,kBAAoB,SAAUmJ,EAAQ,CACrD,IAAIwE,EAASxE,EAAO,cAAc,EAElC,GAAIwE,GAAU,KAId,KAAIxG,EAAe,SAASwG,EAAO,IAAK,EAAE,EAE1C,GAAI,MAAMxG,CAAY,EAAG,CACvB,IAAI0G,EAAe,gCACnB,MAAM,IAAI7N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CACxE,CAEAxE,EAAO,cAAc,aAAehC,EAEpC,IAAI2G,EAAa3E,EAAO,WAAW,EAEnC,GAAI2E,GAAc,KAAW,CAC3B3E,EAAO,WAAW,EAClB,MACF,CAEA,OAAQ2E,EAAW,KAAM,CACvB,KAAK9N,EAAK,WAAW,KACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,UAC1B,KAAKA,EAAK,WAAW,MACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,cACnB,OAAOA,EAAK,YAAY,kBAC1B,KAAKA,EAAK,WAAW,MACnB,OAAOA,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,SACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,cAC1B,QACE,IAAI6N,EAAe,2BAA6BC,EAAW,KAAO,IAClE,MAAM,IAAI9N,EAAK,gBAAiB6N,EAAcC,EAAW,MAAOA,EAAW,GAAG,CAClF,EACF,EAEA9N,EAAK,YAAY,WAAa,SAAUmJ,EAAQ,CAC9C,IAAIwE,EAASxE,EAAO,cAAc,EAElC,GAAIwE,GAAU,KAId,KAAIM,EAAQ,SAASN,EAAO,IAAK,EAAE,EAEnC,GAAI,MAAMM,CAAK,EAAG,CAChB,IAAIJ,EAAe,wBACnB,MAAM,IAAI7N,EAAK,gBAAiB6N,EAAcF,EAAO,MAAOA,EAAO,GAAG,CACxE,CAEAxE,EAAO,cAAc,MAAQ8E,EAE7B,IAAIH,EAAa3E,EAAO,WAAW,EAEnC,GAAI2E,GAAc,KAAW,CAC3B3E,EAAO,WAAW,EAClB,MACF,CAEA,OAAQ2E,EAAW,KAAM,CACvB,KAAK9N,EAAK,WAAW,KACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,UAC1B,KAAKA,EAAK,WAAW,MACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,cACnB,OAAOA,EAAK,YAAY,kBAC1B,KAAKA,EAAK,WAAW,MACnB,OAAOA,EAAK,YAAY,WAC1B,KAAKA,EAAK,WAAW,SACnB,OAAAmJ,EAAO,WAAW,EACXnJ,EAAK,YAAY,cAC1B,QACE,IAAI6N,EAAe,2BAA6BC,EAAW,KAAO,IAClE,MAAM,IAAI9N,EAAK,gBAAiB6N,EAAcC,EAAW,MAAOA,EAAW,GAAG,CAClF,EACF,EAMI,SAAU1G,EAAM8G,EAAS,CACrB,OAAO,QAAW,YAAc,OAAO,IAEzC,OAAOA,CAAO,EACL,OAAOpO,GAAY,SAM5BC,GAAO,QAAUmO,EAAQ,EAGzB9G,EAAK,KAAO8G,EAAQ,CAExB,EAAE,KAAM,UAAY,CAMlB,OAAOlO,CACT,CAAC,CACH,GAAG,IC53GH,IAAAmO,GAAiB,SCiDV,SAASC,GACdC,EAAkBC,EAAmB,SAClC,CACH,IAAMC,EAAKC,GAAsBH,EAAUC,CAAI,EAC/C,GAAI,OAAOC,GAAO,YAChB,MAAM,IAAI,eACR,8BAA8BF,CAAQ,iBACxC,EAGF,OAAOE,CACT,CAsBO,SAASC,GACdH,EAAkBC,EAAmB,SACtB,CACf,OAAOA,EAAK,cAAiBD,CAAQ,GAAK,MAC5C,CCjFK,OAAO,UACV,OAAO,QAAU,SAAUI,EAAa,CACtC,IAAMC,EAA2B,CAAC,EAClC,QA
AWC,KAAO,OAAO,KAAKF,CAAG,EAE/BC,EAAK,KAAK,CAACC,EAAKF,EAAIE,CAAG,CAAC,CAAC,EAG3B,OAAOD,CACT,GAGG,OAAO,SACV,OAAO,OAAS,SAAUD,EAAa,CACrC,IAAMC,EAAiB,CAAC,EACxB,QAAWC,KAAO,OAAO,KAAKF,CAAG,EAE/BC,EAAK,KAAKD,EAAIE,CAAG,CAAC,EAGpB,OAAOD,CACT,GAKE,OAAO,SAAY,cAGhB,QAAQ,UAAU,WACrB,QAAQ,UAAU,SAAW,SAC3BE,EAA8BC,EACxB,CACF,OAAOD,GAAM,UACf,KAAK,WAAaA,EAAE,KACpB,KAAK,UAAYA,EAAE,MAEnB,KAAK,WAAaA,EAClB,KAAK,UAAYC,EAErB,GAGG,QAAQ,UAAU,cACrB,QAAQ,UAAU,YAAc,YAC3BC,EACG,CACN,IAAMC,EAAS,KAAK,WACpB,GAAIA,EAAQ,CACND,EAAM,SAAW,GACnBC,EAAO,YAAY,IAAI,EAGzB,QAASC,EAAIF,EAAM,OAAS,EAAGE,GAAK,EAAGA,IAAK,CAC1C,IAAIC,EAAOH,EAAME,CAAC,EACd,OAAOC,GAAS,SAClBA,EAAO,SAAS,eAAeA,CAAI,EAC5BA,EAAK,YACZA,EAAK,WAAW,YAAYA,CAAI,EAG7BD,EAGHD,EAAO,aAAa,KAAK,gBAAkBE,CAAI,EAF/CF,EAAO,aAAaE,EAAM,IAAI,CAGlC,CACF,CACF,ICDG,SAASC,GACdC,EAC6B,CAC7B,IAAMC,EAAM,IAAI,IAChB,QAAWC,KAAOF,EAAM,CACtB,GAAM,CAACG,CAAI,EAAID,EAAI,SAAS,MAAM,GAAG,EAG/BE,EAAUH,EAAI,IAAIE,CAAI,EACxB,OAAOC,GAAY,YACrBH,EAAI,IAAIE,EAAMD,CAAG,GAIjBD,EAAI,IAAIC,EAAI,SAAUA,CAAG,EACzBA,EAAI,OAASE,EAEjB,CAGA,OAAOH,CACT,CCnEO,SAASI,EACdC,EAAeC,EAAmBC,EAC5B,CAjDR,IAAAC,EAkDEF,EAAY,IAAI,OAAOA,EAAW,GAAG,EAGrC,IAAIG,EACAC,EAAQ,EACZ,EAAG,CACDD,EAAQH,EAAU,KAAKD,CAAK,EAG5B,IAAMM,GAAQH,EAAAC,GAAA,YAAAA,EAAO,QAAP,KAAAD,EAAgBH,EAAM,OAKpC,GAJIK,EAAQC,GACVJ,EAAGG,EAAOC,CAAK,EAGbF,EAAO,CACT,GAAM,CAACG,CAAI,EAAIH,EACfC,EAAQD,EAAM,MAAQG,EAAK,OAGvBA,EAAK,SAAW,IAClBN,EAAU,UAAYG,EAAM,MAAQ,EACxC,CACF,OAASA,EACX,CCFO,SAASI,GACdC,EAAeC,EACT,CAEN,IAAIC,EAAQ,EACRC,EAAQ,EACRC,EAAM,EAGV,QAASC,EAAQ,EAAGD,EAAMJ,EAAM,OAAQI,IAGlCJ,EAAM,OAAOI,CAAG,IAAM,KAAOA,EAAMD,EACrCF,EAAGC,EAAO,EAAcC,EAAOA,EAAQC,CAAG,EAGjCJ,EAAM,OAAOI,CAAG,IAAM,MAC3BJ,EAAM,OAAOG,EAAQ,CAAC,IAAM,IAC1B,EAAEE,IAAU,GACdJ,EAAGC,IAAS,EAAmBC,EAAOC,EAAM,CAAC,EAGtCJ,EAAM,OAAOI,EAAM,CAAC,IAAM,KAC/BC,MAAY,GACdJ,EAAGC,EAAO,EAAkBC,EAAOC,EAAM,CAAC,EAI9CD,EAAQC,EAAM,GAKdA,EAAMD,GACRF,EAAGC,EAAO,EAAcC,EAAOC,CAAG,CACtC,CCnDO,SAASE,GACdC,EAAeC,EAAsBC,EAAuBC,EAAO,GAC3D,CACR,OAAOC,EAAa,CAACJ,CAAK,EAAGC,EAAOC,EAAWC,CAAI,EAAE,IAAI,CAC3D,CAYO
,SAASC,EACdC,EAAkBJ,EAAsBC,EAAuBC,EAAO,GAC5D,CAGV,IAAMG,EAAU,CAAC,CAAC,EAClB,QAASC,EAAI,EAAGA,EAAIN,EAAM,OAAQM,IAAK,CACrC,IAAMC,EAAOP,EAAMM,EAAI,CAAC,EAClBE,EAAOR,EAAMM,CAAC,EAGdG,EAAIF,EAAKA,EAAK,OAAS,CAAC,IAAM,EAAI,KAClCG,EAAIF,EAAK,CAAC,IAAoB,GAGpCH,EAAQ,KAAK,EAAEI,EAAIC,GAAKL,EAAQA,EAAQ,OAAS,CAAC,CAAC,CACrD,CAGA,OAAOD,EAAO,IAAI,CAACL,EAAOY,IAAM,CAC9B,IAAIC,EAAS,EAGPC,EAAS,IAAI,IACnB,QAAWJ,KAAKR,EAAU,KAAK,CAACa,EAAGC,IAAMD,EAAIC,CAAC,EAAG,CAC/C,IAAMC,EAAQP,EAAI,QACZQ,EAAQR,IAAM,GACpB,GAAIJ,EAAQY,CAAK,IAAMN,EACrB,SAGF,IAAIO,EAAQL,EAAO,IAAII,CAAK,EACxB,OAAOC,GAAU,aACnBL,EAAO,IAAII,EAAOC,EAAQ,CAAC,CAAC,EAG9BA,EAAM,KAAKF,CAAK,CAClB,CAGA,GAAIH,EAAO,OAAS,EAClB,OAAOd,EAGT,IAAMoB,EAAmB,CAAC,EAC1B,OAAW,CAACF,EAAOG,CAAO,IAAKP,EAAQ,CACrC,IAAMP,EAAIN,EAAMiB,CAAK,EAGfI,EAASf,EAAE,CAAC,IAAiB,GAC7BgB,EAAShB,EAAEA,EAAE,OAAS,CAAC,IAAM,GAC7BiB,EAASjB,EAAEA,EAAE,OAAS,CAAC,IAAM,EAAI,KAGnCJ,GAAQmB,EAAQT,GAClBO,EAAO,KAAKpB,EAAM,MAAMa,EAAQS,CAAK,CAAC,EAGxC,IAAIG,EAAQzB,EAAM,MAAMsB,EAAOC,EAAMC,CAAM,EAC3C,QAAWE,KAAKL,EAAQ,KAAK,CAACN,EAAGC,IAAMA,EAAID,CAAC,EAAG,CAG7C,IAAML,GAAKH,EAAEmB,CAAC,IAAM,IAAMJ,EACpBX,GAAKJ,EAAEmB,CAAC,IAAM,EAAI,MAAShB,EAGjCe,EAAQ,CACNA,EAAM,MAAM,EAAGf,CAAC,EAChB,SACAe,EAAM,MAAMf,EAAGC,CAAC,EAChB,UACAc,EAAM,MAAMd,CAAC,CACf,EAAE,KAAK,EAAE,CACX,CAMA,GAHAE,EAASU,EAAMC,EAGXJ,EAAO,KAAKK,CAAK,IAAM,EACzB,KACJ,CAGA,OAAItB,GAAQU,EAASb,EAAM,QACzBoB,EAAO,KAAKpB,EAAM,MAAMa,CAAM,CAAC,EAG1BO,EAAO,KAAK,EAAE,CACvB,CAAC,CACH,CChHO,SAASO,GACdC,EACc,CACd,IAAMC,EAAuB,CAAC,EAC9B,GAAI,OAAOD,GAAU,YACnB,OAAOC,EAGT,IAAMC,EAAS,MAAM,QAAQF,CAAK,EAAIA,EAAQ,CAACA,CAAK,EACpD,QAASG,EAAI,EAAGA,EAAID,EAAO,OAAQC,IAAK,CACtC,IAAMC,EAAQ,KAAK,UAAU,MACvBC,EAAQD,EAAM,OAGpBE,GAAQJ,EAAOC,CAAC,EAAG,CAACI,EAAOC,EAAMC,EAAOC,IAAQ,CA/DpD,IAAAC,EAiEM,OADAP,EAAAO,EAAMJ,GAASF,KAAfD,EAAAO,GAA0B,CAAC,GACnBH,EAAM,CAGZ,OACA,OACEJ,EAAMG,CAAK,EAAE,KACXE,GAAe,GACfC,EAAMD,GAAU,EAChBD,CACF,EACA,MAGF,OACE,IAAMI,EAAUV,EAAOC,CAAC,EAAE,MAAMM,EAAOC,CAAG,EAC1CG,EAAMD,EAAS,KAAK,UAAU,UAAW,CAACE,EAAOC,IAAU,CAOzD,GAAI,OAAO,KAAK,WAAc,
YAAa,CACzC,IAAMC,EAAaJ,EAAQ,MAAME,EAAOC,CAAK,EAC7C,GAAI,WAAW,KAAK,KAAK,UAAU,OAAOC,CAAU,CAAC,EAAG,CACtD,IAAMC,EAAW,KAAK,UAAU,QAAQD,CAAU,EAClD,QAASE,EAAI,EAAGC,EAAI,EAAGD,EAAID,EAAS,OAAQC,IAG1Cd,EAAAG,KAAAH,EAAAG,GAAiB,CAAC,GAClBH,EAAMG,CAAK,EAAE,KACXE,EAAQK,EAAQK,GAAM,GACtBF,EAASC,CAAC,EAAE,QAAW,EACvBV,CACF,EAGAP,EAAO,KAAK,IAAI,KAAK,MACnBgB,EAASC,CAAC,EAAE,YAAY,EAAG,CACzB,SAAUX,GAAS,GAAKH,EAAMG,CAAK,EAAE,OAAS,CAChD,CACF,CAAC,EAGDY,GAAKF,EAASC,CAAC,EAAE,OAEnB,MACF,CACF,CAGAd,EAAMG,CAAK,EAAE,KACXE,EAAQK,GAAS,GACjBC,EAAQD,GAAU,EAClBN,CACF,EAGAP,EAAO,KAAK,IAAI,KAAK,MACnBW,EAAQ,MAAME,EAAOC,CAAK,EAAE,YAAY,EAAG,CACzC,SAAUR,GAAS,GAAKH,EAAMG,CAAK,EAAE,OAAS,CAChD,CACF,CAAC,CACH,CAAC,CACL,CACF,CAAC,CACH,CAGA,OAAON,CACT,CCjEO,SAASmB,GACdC,EAAeC,EAAgBC,GAAQA,EAC/B,CACR,OAAOF,EAGJ,KAAK,EAGL,MAAM,YAAY,EAChB,IAAI,CAACG,EAAOC,IAAUA,EAAQ,EAC3BD,EAAM,QAAQ,+BAAgC,IAAI,EAClDA,CACJ,EACC,KAAK,EAAE,EAGT,QAAQ,kCAAmC,EAAE,EAG7C,MAAM,MAAM,EACV,OAAO,CAACE,EAAMH,IAAS,CACtB,IAAMI,EAAOL,EAAGC,CAAI,EACpB,MAAO,CAAC,GAAGG,EAAM,GAAG,MAAM,QAAQC,CAAI,EAAIA,EAAO,CAACA,CAAI,CAAC,CACzD,EAAG,CAAC,CAAa,EAChB,IAAIJ,GAAQ,UAAU,KAAKA,CAAI,EAAI,GAAGA,CAAI,IAAMA,CAAI,EACpD,IAAIA,GAAQ,mBAAmB,KAAKA,CAAI,EAAIA,EAAO,GAAGA,CAAI,GAAG,EAC7D,KAAK,GAAG,CACf,CCxCO,SAASK,GACdC,EACQ,CAGR,OAAOC,GAAUD,EAAOE,GAAQ,CAC9B,IAAMC,EAAkB,CAAC,EAGnBC,EAAQ,IAAI,KAAK,WAAWF,CAAI,EACtCE,EAAM,IAAI,EAGV,OAAW,CAAE,KAAAC,EAAM,IAAKC,EAAM,MAAAC,EAAO,IAAAC,CAAI,IAAKJ,EAAM,QAClD,OAAQC,EAAM,CAGZ,IAAK,QACE,CAAC,QAAS,OAAQ,MAAM,EAAE,SAASC,CAAI,IAC1CJ,EAAO,CACLA,EAAK,MAAM,EAAGM,CAAG,EACjB,IACAN,EAAK,MAAMM,EAAM,CAAC,CACpB,EAAE,KAAK,EAAE,GACX,MAGF,IAAK,OACHC,EAAMH,EAAM,KAAK,UAAU,UAAW,IAAII,IAAU,CAClDP,EAAM,KAAK,CACTD,EAAK,MAAM,EAAGK,CAAK,EACnBD,EAAK,MAAM,GAAGI,CAAK,EACnBR,EAAK,MAAMM,CAAG,CAChB,EAAE,KAAK,EAAE,CAAC,CACZ,CAAC,CACL,CAGF,OAAOL,CACT,CAAC,CACH,CAgBO,SAASQ,GACdC,EACqB,CACrB,IAAMZ,EAAS,IAAI,KAAK,MAAM,CAAC,QAAS,OAAQ,MAAM,CAAC,EACxC,IAAI,KAAK,YAAYY,EAAOZ,CAAK,EAGzC,MAAM,EACb,QAAWa,KAAUb,EAAM,QACzBa,EAAO,YAAc,GAGjBA,EAAO,KAAK,WAAW,GAAG,IAC5BA,EAAO,SA
AW,KAAK,MAAM,SAAS,QACtCA,EAAO,KAAOA,EAAO,KAAK,MAAM,CAAC,GAI/BA,EAAO,KAAK,SAAS,GAAG,IAC1BA,EAAO,SAAW,KAAK,MAAM,SAAS,SACtCA,EAAO,KAAOA,EAAO,KAAK,MAAM,EAAG,EAAE,GAKzC,OAAOb,EAAM,OACf,CAUO,SAASc,GACdd,EAA4BG,EACV,CAxJpB,IAAAY,EAyJE,IAAMC,EAAU,IAAI,IAAuBhB,CAAK,EAG1CiB,EAA2B,CAAC,EAClC,QAASC,EAAI,EAAGA,EAAIf,EAAM,OAAQe,IAChC,QAAWL,KAAUG,EACfb,EAAMe,CAAC,EAAE,WAAWL,EAAO,IAAI,IACjCI,EAAOJ,EAAO,IAAI,EAAI,GACtBG,EAAQ,OAAOH,CAAM,GAI3B,QAAWA,KAAUG,GACfD,EAAA,KAAK,iBAAL,MAAAA,EAAA,UAAsBF,EAAO,QAC/BI,EAAOJ,EAAO,IAAI,EAAI,IAG1B,OAAOI,CACT,CClIO,SAASE,GACdC,EAAeC,EACG,CAClB,IAAMC,EAAW,IAAI,IAGfC,EAAW,IAAI,YAAYH,EAAM,MAAM,EAC7C,QAASI,EAAI,EAAGA,EAAIJ,EAAM,OAAQI,IAChC,QAASC,EAAID,EAAI,EAAGC,EAAIL,EAAM,OAAQK,IACtBL,EAAM,MAAMI,EAAGC,CAAC,IACjBJ,IACXE,EAASC,CAAC,EAAIC,EAAID,GAIxB,IAAME,EAAQ,CAAC,CAAC,EAChB,QAAS,EAAIA,EAAM,OAAQ,EAAI,GAAI,CACjC,IAAMC,EAAID,EAAM,EAAE,CAAC,EACnB,QAASE,EAAI,EAAGA,EAAIL,EAASI,CAAC,EAAGC,IAC3BL,EAASI,EAAIC,CAAC,EAAIL,EAASI,CAAC,EAAIC,IAClCN,EAAS,IAAIF,EAAM,MAAMO,EAAGA,EAAIC,CAAC,CAAC,EAClCF,EAAM,GAAG,EAAIC,EAAIC,GAIrB,IAAMA,EAAID,EAAIJ,EAASI,CAAC,EACpBJ,EAASK,CAAC,GAAKA,EAAIR,EAAM,OAAS,IACpCM,EAAM,GAAG,EAAIE,GAGfN,EAAS,IAAIF,EAAM,MAAMO,EAAGC,CAAC,CAAC,CAChC,CAGA,OAAIN,EAAS,IAAI,EAAE,EACV,IAAI,IAAI,CAACF,CAAK,CAAC,EAGjBE,CACT,CCJA,SAASO,GAAUC,EAAmC,CACpD,OAAQC,GACEC,GAAwB,CAC9B,GAAI,OAAOA,EAAID,CAAI,GAAM,YACvB,OAGF,IAAME,EAAK,CAACD,EAAI,SAAUD,CAAI,EAAE,KAAK,GAAG,EACxC,OAAAD,EAAM,IAAIG,EAAI,KAAK,UAAU,MAAQ,CAAC,CAAC,EAGhCD,EAAID,CAAI,CACjB,CAEJ,CAUA,SAASG,GAAWC,EAAaC,EAAuB,CACtD,GAAM,CAACC,EAAGC,CAAC,EAAI,CAAC,IAAI,IAAIH,CAAC,EAAG,IAAI,IAAIC,CAAC,CAAC,EACtC,MAAO,CACL,GAAG,IAAI,IAAI,CAAC,GAAGC,CAAC,EAAE,OAAOE,GAAS,CAACD,EAAE,IAAIC,CAAK,CAAC,CAAC,CAClD,CACF,CASO,IAAMC,EAAN,KAAa,CA2BX,YAAY,CAAE,OAAAC,EAAQ,KAAAC,EAAM,QAAAC,CAAQ,EAAgB,CACzD,IAAMC,EAAQf,GAAU,KAAK,MAAQ,IAAI,GAAK,EAG9C,KAAK,IAAMgB,GAAuBH,CAAI,EACtC,KAAK,QAAUC,EAGf,KAAK,MAAQ,KAAK,UAAY,CAC5B,KAAK,kBAAoB,CAAC,UAAU,EACpC,KAAK,EAAE,CAAC,EAGJF,EAAO,KAAK,SAAW,GAAKA,EAAO,KAAK,CAAC,IAAM,KAEjD,KAAK,IAAI,KAAKA,EAA
O,KAAK,CAAC,CAAC,CAAC,EACpBA,EAAO,KAAK,OAAS,GAC9B,KAAK,IAAI,KAAK,cAAc,GAAGA,EAAO,IAAI,CAAC,EAI7C,KAAK,UAAYK,GACjB,KAAK,UAAU,UAAY,IAAI,OAAOL,EAAO,SAAS,EAGtD,KAAK,UAAY,kBAAmB,KAChC,IAAI,KAAK,cACT,OAGJ,IAAMM,EAAMb,GAAW,CACrB,UAAW,iBAAkB,SAC/B,EAAGO,EAAO,QAAQ,EAGlB,QAAWO,KAAQP,EAAO,KAAK,IAAIQ,GAEjCA,IAAa,KAAO,KAAO,KAAKA,CAAQ,CACzC,EACC,QAAWC,KAAMH,EACf,KAAK,SAAS,OAAOC,EAAKE,CAAE,CAAC,EAC7B,KAAK,eAAe,OAAOF,EAAKE,CAAE,CAAC,EAIvC,KAAK,IAAI,UAAU,EAGnB,KAAK,MAAM,QAAS,CAAE,MAAO,IAAK,UAAWN,EAAM,OAAO,CAAE,CAAC,EAC7D,KAAK,MAAM,OAAS,CAAE,MAAO,EAAK,UAAWA,EAAM,MAAM,CAAE,CAAC,EAC5D,KAAK,MAAM,OAAS,CAAE,MAAO,IAAK,UAAWA,EAAM,MAAM,CAAE,CAAC,EAG5D,QAAWZ,KAAOU,EAChB,KAAK,IAAIV,EAAK,CAAE,MAAOA,EAAI,KAAM,CAAC,CACtC,CAAC,CACH,CASO,OAAOmB,EAA6B,CAUzC,GAPAA,EAAQA,EAAM,QAAQ,WAAC,eAAY,IAAE,EAAEZ,GAC9B,CAAC,GAAGa,GAAQb,EAAO,KAAK,MAAM,aAAa,CAAC,EAChD,KAAK,IAAI,CACb,EAGDY,EAAQE,GAAqBF,CAAK,EAC9B,CAACA,EACH,MAAO,CAAE,MAAO,CAAC,CAAE,EAGrB,IAAMG,EAAUC,GAAiBJ,CAAK,EACnC,OAAOK,GACNA,EAAO,WAAa,KAAK,MAAM,SAAS,UACzC,EAGGC,EAAS,KAAK,MAAM,OAAON,CAAK,EAGnC,OAAqB,CAACO,EAAM,CAAE,IAAAC,EAAK,MAAAC,EAAO,UAAAC,CAAU,IAAM,CACzD,IAAI7B,EAAM,KAAK,IAAI,IAAI2B,CAAG,EAC1B,GAAI,OAAO3B,GAAQ,YAAa,CAG9BA,EAAM8B,EAAA,GAAK9B,GACPA,EAAI,OACNA,EAAI,KAAO,CAAC,GAAGA,EAAI,IAAI,GAGzB,IAAM+B,EAAQC,GACZV,EACA,OAAO,KAAKO,EAAU,QAAQ,CAChC,EAGA,QAAWjB,KAAS,KAAK,MAAM,OAAQ,CACrC,GAAI,OAAOZ,EAAIY,CAAK,GAAM,YACxB,SAGF,IAAMqB,EAAwB,CAAC,EAC/B,QAAWC,KAAS,OAAO,OAAOL,EAAU,QAAQ,EAC9C,OAAOK,EAAMtB,CAAK,GAAM,aAC1BqB,EAAU,KAAK,GAAGC,EAAMtB,CAAK,EAAE,QAAQ,EAG3C,GAAI,CAACqB,EAAU,OACb,SAGF,IAAMnC,EAAQ,KAAK,MAAM,IAAI,CAACE,EAAI,SAAUY,CAAK,EAAE,KAAK,GAAG,CAAC,EACtDM,EAAK,MAAM,QAAQlB,EAAIY,CAAK,CAAC,EAC/BuB,EACAC,GAGJpC,EAAIY,CAAK,EAAIM,EAAGlB,EAAIY,CAAK,EAAGd,EAAOmC,EAAWrB,IAAU,MAAM,CAChE,CAGA,IAAMyB,EAAQ,CAAC,CAACrC,EAAI,OAClB,OAAO,OAAO+B,CAAK,EAChB,OAAOO,GAAKA,CAAC,EAAE,OAClB,OAAO,KAAKP,CAAK,EAAE,OAGrBL,EAAK,KAAKa,EAAAT,EAAA,GACL9B,GADK,CAER,MAAO4B,GAAS,EAAIY,EAAAH,EAAS,IAC7B,MAAAN,CACF,EAAC,CACH,CACA,OAAOL,CACT,EAAG,CAAC,CAAC,EAGJ,KAAK,CAACvB,EAAGC,IAA
MA,EAAE,MAAQD,EAAE,KAAK,EAGhC,OAAO,CAACsC,EAAOC,IAAW,CACzB,IAAM1C,EAAM,KAAK,IAAI,IAAI0C,EAAO,QAAQ,EACxC,GAAI,OAAO1C,GAAQ,YAAa,CAC9B,IAAM2B,EAAM3B,EAAI,OACZA,EAAI,OAAO,SACXA,EAAI,SACRyC,EAAM,IAAId,EAAK,CAAC,GAAGc,EAAM,IAAId,CAAG,GAAK,CAAC,EAAGe,CAAM,CAAC,CAClD,CACA,OAAOD,CACT,EAAG,IAAI,GAA2B,EAGpC,OAAW,CAACd,EAAKc,CAAK,IAAKhB,EACzB,GAAI,CAACgB,EAAM,KAAKf,GAAQA,EAAK,WAAaC,CAAG,EAAG,CAC9C,IAAM3B,EAAM,KAAK,IAAI,IAAI2B,CAAG,EAC5Bc,EAAM,KAAKF,EAAAT,EAAA,GAAK9B,GAAL,CAAU,MAAO,EAAG,MAAO,CAAC,CAAE,EAAC,CAC5C,CAGF,IAAI2C,EACJ,GAAI,KAAK,QAAQ,QAAS,CACxB,IAAMC,EAAS,KAAK,MAAM,MAAMC,GAAW,CACzC,QAAWrB,KAAUF,EACnBuB,EAAQ,KAAKrB,EAAO,KAAM,CACxB,OAAQ,CAAC,OAAO,EAChB,SAAU,KAAK,MAAM,SAAS,SAC9B,SAAU,KAAK,MAAM,SAAS,QAChC,CAAC,CACL,CAAC,EAGDmB,EAAUC,EAAO,OACb,OAAO,KAAKA,EAAO,CAAC,EAAE,UAAU,QAAQ,EACxC,CAAC,CACP,CAGA,OAAOd,EAAA,CACL,MAAO,CAAC,GAAGL,EAAO,OAAO,CAAC,GACvB,OAAOkB,GAAY,aAAe,CAAE,QAAAA,CAAQ,EAEnD,CACF,EX5QA,IAAIG,GAqBJ,SAAeC,GACbC,EACe,QAAAC,EAAA,sBACf,IAAIC,EAAO,UAGX,GAAI,OAAO,QAAW,aAAe,iBAAkB,OAAQ,CAC7D,IAAMC,EAASC,GAA8B,aAAa,EACpD,CAACC,CAAI,EAAIF,EAAO,IAAI,MAAM,SAAS,EAGzCD,EAAOA,EAAK,QAAQ,KAAMG,CAAI,CAChC,CAGA,IAAMC,EAAU,CAAC,EACjB,QAAWC,KAAQP,EAAO,KAAM,CAC9B,OAAQO,EAAM,CAGZ,IAAK,KACHD,EAAQ,KAAK,GAAGJ,CAAI,aAAa,EACjC,MAGF,IAAK,KACL,IAAK,KACHI,EAAQ,KAAK,GAAGJ,CAAI,aAAa,EACjC,KACJ,CAGIK,IAAS,MACXD,EAAQ,KAAK,GAAGJ,CAAI,aAAaK,CAAI,SAAS,CAClD,CAGIP,EAAO,KAAK,OAAS,GACvBM,EAAQ,KAAK,GAAGJ,CAAI,wBAAwB,EAG1CI,EAAQ,SACV,MAAM,cACJ,GAAGJ,CAAI,mCACP,GAAGI,CACL,EACJ,GAaA,SAAsBE,GACpBC,EACwB,QAAAR,EAAA,sBACxB,OAAQQ,EAAQ,KAAM,CAGpB,OACE,aAAMV,GAAqBU,EAAQ,KAAK,MAAM,EAC9CX,GAAQ,IAAIY,EAAOD,EAAQ,IAAI,EACxB,CACL,MACF,EAGF,OACE,IAAME,EAAQF,EAAQ,KACtB,GAAI,CACF,MAAO,CACL,OACA,KAAMX,GAAM,OAAOa,CAAK,CAC1B,CAGF,OAASC,EAAK,CACZ,eAAQ,KAAK,kBAAkBD,CAAK,oCAA+B,EACnE,QAAQ,KAAKC,CAAG,EACT,CACL,OACA,KAAM,CAAE,MAAO,CAAC,CAAE,CACpB,CACF,CAGF,QACE,MAAM,IAAI,UAAU,sBAAsB,CAC9C,CACF,GAOA,KAAK,KAAO,GAAAC,QAGZ,iBAAiB,UAAiBC,GAAMb,EAAA,wBACtC,YAAY,MAAMO,GAAQM,EAAG,IAAI,CAAC,CACpC,EAAC",
+  "names": ["require_lunr", "__commonJSMin", "exports", "module", "lunr", "config", "builder", "global", "message", "obj", "clone", "keys", "key", "val", "docRef", "fieldName", "stringValue", "s", "n", "fieldRef", "elements", "i", "other", "object", "a", "b", "intersection", "element", "posting", "documentCount", "documentsWithTerm", "x", "str", "metadata", "fn", "t", "len", "tokens", "sliceEnd", "sliceStart", "char", "sliceLength", "tokenMetadata", "label", "isRegistered", "serialised", "pipeline", "fnName", "fns", "existingFn", "newFn", "pos", "stackLength", "memo", "j", "result", "k", "token", "index", "start", "end", "pivotPoint", "pivotIndex", "insertIdx", "position", "sumOfSquares", "elementsLength", "otherVector", "dotProduct", "aLen", "bLen", "aVal", "bVal", "output", "step2list", "step3list", "c", "v", "C", "V", "mgr0", "meq1", "mgr1", "s_v", "re_mgr0", "re_mgr1", "re_meq1", "re_s_v", "re_1a", "re2_1a", "re_1b", "re2_1b", "re_1b_2", "re2_1b_2", "re3_1b_2", "re4_1b_2", "re_1c", "re_2", "re_3", "re_4", "re2_4", "re_5", "re_5_1", "re3_5", "porterStemmer", "w", "stem", "suffix", "firstch", "re", "re2", "re3", "re4", "fp", "stopWords", "words", "stopWord", "arr", "clause", "editDistance", "root", "stack", "frame", "noEditNode", "insertionNode", "substitutionNode", "charA", "charB", "transposeNode", "node", "final", "next", "edges", "edge", "labels", "qEdges", "qLen", "nEdges", "nLen", "q", "qEdge", "nEdge", "qNode", "word", "commonPrefix", "nextNode", "downTo", "childKey", "attrs", "queryString", "query", "parser", "matchingFields", "queryVectors", "termFieldCache", "requiredMatches", "prohibitedMatches", "terms", "clauseMatches", "m", "term", "termTokenSet", "expandedTerms", "field", "expandedTerm", "termIndex", "fieldPosting", "matchingDocumentRefs", "termField", "matchingDocumentsSet", "l", "matchingDocumentRef", "matchingFieldRef", "fieldMatch", "allRequiredMatches", "allProhibitedMatches", "matchingFieldRefs", "results", "matches", "fieldVector", "score", 
"docMatch", "match", "invertedIndex", "fieldVectors", "ref", "serializedIndex", "serializedVectors", "serializedInvertedIndex", "tokenSetBuilder", "tuple", "attributes", "number", "doc", "fields", "extractor", "fieldTerms", "metadataKey", "fieldRefs", "numberOfFields", "accumulator", "documentsWithField", "fieldRefsLength", "termIdfCache", "fieldLength", "termFrequencies", "termsLength", "fieldBoost", "docBoost", "tf", "idf", "scoreWithPrecision", "args", "clonedMetadata", "metadataKeys", "otherMatchData", "allFields", "options", "state", "subSlices", "type", "charCode", "lexer", "lexeme", "completedClause", "errorMessage", "nextLexeme", "possibleFields", "f", "boost", "factory", "import_lunr", "getElement", "selector", "node", "el", "getOptionalElement", "obj", "data", "key", "x", "y", "nodes", "parent", "i", "node", "setupSearchDocumentMap", "docs", "map", "doc", "path", "article", "split", "input", "separator", "fn", "_a", "match", "index", "until", "term", "extract", "input", "fn", "block", "start", "end", "stack", "highlight", "input", "table", "positions", "full", "highlightAll", "inputs", "mapping", "t", "prev", "next", "p", "q", "i", "cursor", "blocks", "a", "b", "index", "block", "group", "slices", "indexes", "start", "end", "length", "slice", "j", "tokenize", "input", "tokens", "inputs", "i", "table", "total", "extract", "block", "type", "start", "end", "_a", "section", "split", "index", "until", "subsection", "segments", "s", "l", "transform", "query", "fn", "term", "parts", "index", "prev", "next", "transformSearchQuery", "query", "transform", "part", "terms", "lexer", "type", "term", "start", "end", "split", "range", "parseSearchQuery", "value", "clause", "getSearchQueryTerms", "_a", "clauses", "result", "t", "segment", "query", "index", "segments", "wordcuts", "i", "j", "stack", "p", "q", "extractor", "table", "name", "doc", "id", "difference", "a", "b", "x", "y", "value", "Search", "config", "docs", "options", "field", "setupSearchDocumentMap", 
"tokenize", "fns", "lang", "language", "fn", "query", "segment", "transformSearchQuery", "clauses", "parseSearchQuery", "clause", "groups", "item", "ref", "score", "matchData", "__spreadValues", "terms", "getSearchQueryTerms", "positions", "match", "highlightAll", "highlight", "boost", "t", "__spreadProps", "__pow", "items", "result", "suggest", "titles", "builder", "index", "setupSearchLanguages", "config", "__async", "base", "worker", "getElement", "path", "scripts", "lang", "handler", "message", "Search", "query", "err", "lunr", "ev"]
+}
diff --git a/assets/stylesheets/main.76a95c52.min.css b/assets/stylesheets/main.76a95c52.min.css
new file mode 100644
index 0000000..120bca6
--- /dev/null
+++ b/assets/stylesheets/main.76a95c52.min.css
@@ -0,0 +1 @@
+@charset "UTF-8";html{-webkit-text-size-adjust:none;-moz-text-size-adjust:none;text-size-adjust:none;box-sizing:border-box}*,:after,:before{box-sizing:inherit}@media (prefers-reduced-motion){*,:after,:before{transition:none!important}}body{margin:0}a,button,input,label{-webkit-tap-highlight-color:transparent}a{color:inherit;text-decoration:none}hr{border:0;box-sizing:initial;display:block;height:.05rem;overflow:visible;padding:0}small{font-size:80%}sub,sup{line-height:1em}img{border-style:none}table{border-collapse:initial;border-spacing:0}td,th{font-weight:400;vertical-align:top}button{background:#0000;border:0;font-family:inherit;font-size:inherit;margin:0;padding:0}input{border:0;outline:none}:root{--md-primary-fg-color:#4051b5;--md-primary-fg-color--light:#5d6cc0;--md-primary-fg-color--dark:#303fa1;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3;--md-accent-fg-color:#526cfe;--md-accent-fg-color--transparent:#526cfe1a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-scheme=default]{color-scheme:light}[data-md-color-scheme=default] img[src$="#gh-dark-mode-only"],[data-md-color-scheme=default] 
img[src$="#only-dark"]{display:none}:root,[data-md-color-scheme=default]{--md-hue:225deg;--md-default-fg-color:#000000de;--md-default-fg-color--light:#0000008a;--md-default-fg-color--lighter:#00000052;--md-default-fg-color--lightest:#00000012;--md-default-bg-color:#fff;--md-default-bg-color--light:#ffffffb3;--md-default-bg-color--lighter:#ffffff4d;--md-default-bg-color--lightest:#ffffff1f;--md-code-fg-color:#36464e;--md-code-bg-color:#f5f5f5;--md-code-hl-color:#4287ff;--md-code-hl-color--light:#4287ff1a;--md-code-hl-number-color:#d52a2a;--md-code-hl-special-color:#db1457;--md-code-hl-function-color:#a846b9;--md-code-hl-constant-color:#6e59d9;--md-code-hl-keyword-color:#3f6ec6;--md-code-hl-string-color:#1c7d4d;--md-code-hl-name-color:var(--md-code-fg-color);--md-code-hl-operator-color:var(--md-default-fg-color--light);--md-code-hl-punctuation-color:var(--md-default-fg-color--light);--md-code-hl-comment-color:var(--md-default-fg-color--light);--md-code-hl-generic-color:var(--md-default-fg-color--light);--md-code-hl-variable-color:var(--md-default-fg-color--light);--md-typeset-color:var(--md-default-fg-color);--md-typeset-a-color:var(--md-primary-fg-color);--md-typeset-del-color:#f5503d26;--md-typeset-ins-color:#0bd57026;--md-typeset-kbd-color:#fafafa;--md-typeset-kbd-accent-color:#fff;--md-typeset-kbd-border-color:#b8b8b8;--md-typeset-mark-color:#ffff0080;--md-typeset-table-color:#0000001f;--md-typeset-table-color--light:rgba(0,0,0,.035);--md-admonition-fg-color:var(--md-default-fg-color);--md-admonition-bg-color:var(--md-default-bg-color);--md-warning-fg-color:#000000de;--md-warning-bg-color:#ff9;--md-footer-fg-color:#fff;--md-footer-fg-color--light:#ffffffb3;--md-footer-fg-color--lighter:#ffffff73;--md-footer-bg-color:#000000de;--md-footer-bg-color--dark:#00000052;--md-shadow-z1:0 0.2rem 0.5rem #0000000d,0 0 0.05rem #0000001a;--md-shadow-z2:0 0.2rem 0.5rem #0000001a,0 0 0.05rem #00000040;--md-shadow-z3:0 0.2rem 0.5rem #0003,0 0 0.05rem #00000059}.md-icon 
svg{fill:currentcolor;display:block;height:1.2rem;width:1.2rem}body{-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;--md-text-font-family:var(--md-text-font,_),-apple-system,BlinkMacSystemFont,Helvetica,Arial,sans-serif;--md-code-font-family:var(--md-code-font,_),SFMono-Regular,Consolas,Menlo,monospace}aside,body,input{font-feature-settings:"kern","liga";color:var(--md-typeset-color);font-family:var(--md-text-font-family)}code,kbd,pre{font-feature-settings:"kern";font-family:var(--md-code-font-family)}:root{--md-typeset-table-sort-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m18 21-4-4h3V7h-3l4-4 4 4h-3v10h3M2 19v-2h10v2M2 13v-2h7v2M2 7V5h4v2H2Z"/></svg>');--md-typeset-table-sort-icon--asc:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 17h3l-4 4-4-4h3V3h2M2 17h10v2H2M6 5v2H2V5m0 6h7v2H2v-2Z"/></svg>');--md-typeset-table-sort-icon--desc:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 7h3l-4-4-4 4h3v14h2M2 17h10v2H2M6 5v2H2V5m0 6h7v2H2v-2Z"/></svg>')}.md-typeset{-webkit-print-color-adjust:exact;color-adjust:exact;font-size:.8rem;line-height:1.6}@media print{.md-typeset{font-size:.68rem}}.md-typeset blockquote,.md-typeset dl,.md-typeset figure,.md-typeset ol,.md-typeset pre,.md-typeset ul{margin-bottom:1em;margin-top:1em}.md-typeset h1{color:var(--md-default-fg-color--light);font-size:2em;line-height:1.3;margin:0 0 1.25em}.md-typeset h1,.md-typeset h2{font-weight:300;letter-spacing:-.01em}.md-typeset h2{font-size:1.5625em;line-height:1.4;margin:1.6em 0 .64em}.md-typeset h3{font-size:1.25em;font-weight:400;letter-spacing:-.01em;line-height:1.5;margin:1.6em 0 .8em}.md-typeset h2+h3{margin-top:.8em}.md-typeset h4{font-weight:700;letter-spacing:-.01em;margin:1em 0}.md-typeset h5,.md-typeset 
h6{color:var(--md-default-fg-color--light);font-size:.8em;font-weight:700;letter-spacing:-.01em;margin:1.25em 0}.md-typeset h5{text-transform:uppercase}.md-typeset hr{border-bottom:.05rem solid var(--md-default-fg-color--lightest);display:flow-root;margin:1.5em 0}.md-typeset a{color:var(--md-typeset-a-color);word-break:break-word}.md-typeset a,.md-typeset a:before{transition:color 125ms}.md-typeset a:focus,.md-typeset a:hover{color:var(--md-accent-fg-color)}.md-typeset a:focus code,.md-typeset a:hover code{background-color:var(--md-accent-fg-color--transparent)}.md-typeset a code{color:currentcolor;transition:background-color 125ms}.md-typeset a.focus-visible{outline-color:var(--md-accent-fg-color);outline-offset:.2rem}.md-typeset code,.md-typeset kbd,.md-typeset pre{color:var(--md-code-fg-color);direction:ltr;font-variant-ligatures:none}@media print{.md-typeset code,.md-typeset kbd,.md-typeset pre{white-space:pre-wrap}}.md-typeset code{background-color:var(--md-code-bg-color);border-radius:.1rem;-webkit-box-decoration-break:clone;box-decoration-break:clone;font-size:.85em;padding:0 .2941176471em;word-break:break-word}.md-typeset code:not(.focus-visible){-webkit-tap-highlight-color:transparent;outline:none}.md-typeset pre{display:flow-root;line-height:1.4;position:relative}.md-typeset pre>code{-webkit-box-decoration-break:slice;box-decoration-break:slice;box-shadow:none;display:block;margin:0;outline-color:var(--md-accent-fg-color);overflow:auto;padding:.7720588235em 1.1764705882em;scrollbar-color:var(--md-default-fg-color--lighter) #0000;scrollbar-width:thin;touch-action:auto;word-break:normal}.md-typeset pre>code:hover{scrollbar-color:var(--md-accent-fg-color) #0000}.md-typeset pre>code::-webkit-scrollbar{height:.2rem;width:.2rem}.md-typeset pre>code::-webkit-scrollbar-thumb{background-color:var(--md-default-fg-color--lighter)}.md-typeset pre>code::-webkit-scrollbar-thumb:hover{background-color:var(--md-accent-fg-color)}.md-typeset 
kbd{background-color:var(--md-typeset-kbd-color);border-radius:.1rem;box-shadow:0 .1rem 0 .05rem var(--md-typeset-kbd-border-color),0 .1rem 0 var(--md-typeset-kbd-border-color),0 -.1rem .2rem var(--md-typeset-kbd-accent-color) inset;color:var(--md-default-fg-color);display:inline-block;font-size:.75em;padding:0 .6666666667em;vertical-align:text-top;word-break:break-word}.md-typeset mark{background-color:var(--md-typeset-mark-color);-webkit-box-decoration-break:clone;box-decoration-break:clone;color:inherit;word-break:break-word}.md-typeset abbr{border-bottom:.05rem dotted var(--md-default-fg-color--light);cursor:help;text-decoration:none}.md-typeset small{opacity:.75}[dir=ltr] .md-typeset sub,[dir=ltr] .md-typeset sup{margin-left:.078125em}[dir=rtl] .md-typeset sub,[dir=rtl] .md-typeset sup{margin-right:.078125em}[dir=ltr] .md-typeset blockquote{padding-left:.6rem}[dir=rtl] .md-typeset blockquote{padding-right:.6rem}[dir=ltr] .md-typeset blockquote{border-left:.2rem solid var(--md-default-fg-color--lighter)}[dir=rtl] .md-typeset blockquote{border-right:.2rem solid var(--md-default-fg-color--lighter)}.md-typeset blockquote{color:var(--md-default-fg-color--light);margin-left:0;margin-right:0}.md-typeset ul{list-style-type:disc}[dir=ltr] .md-typeset ol,[dir=ltr] .md-typeset ul{margin-left:.625em}[dir=rtl] .md-typeset ol,[dir=rtl] .md-typeset ul{margin-right:.625em}.md-typeset ol,.md-typeset ul{padding:0}.md-typeset ol:not([hidden]),.md-typeset ul:not([hidden]){display:flow-root}.md-typeset ol ol,.md-typeset ul ol{list-style-type:lower-alpha}.md-typeset ol ol ol,.md-typeset ul ol ol{list-style-type:lower-roman}[dir=ltr] .md-typeset ol li,[dir=ltr] .md-typeset ul li{margin-left:1.25em}[dir=rtl] .md-typeset ol li,[dir=rtl] .md-typeset ul li{margin-right:1.25em}.md-typeset ol li,.md-typeset ul li{margin-bottom:.5em}.md-typeset ol li blockquote,.md-typeset ol li p,.md-typeset ul li blockquote,.md-typeset ul li p{margin:.5em 0}.md-typeset ol li:last-child,.md-typeset ul 
li:last-child{margin-bottom:0}[dir=ltr] .md-typeset ol li ol,[dir=ltr] .md-typeset ol li ul,[dir=ltr] .md-typeset ul li ol,[dir=ltr] .md-typeset ul li ul{margin-left:.625em}[dir=rtl] .md-typeset ol li ol,[dir=rtl] .md-typeset ol li ul,[dir=rtl] .md-typeset ul li ol,[dir=rtl] .md-typeset ul li ul{margin-right:.625em}.md-typeset ol li ol,.md-typeset ol li ul,.md-typeset ul li ol,.md-typeset ul li ul{margin-bottom:.5em;margin-top:.5em}[dir=ltr] .md-typeset dd{margin-left:1.875em}[dir=rtl] .md-typeset dd{margin-right:1.875em}.md-typeset dd{margin-bottom:1.5em;margin-top:1em}.md-typeset img,.md-typeset svg,.md-typeset video{height:auto;max-width:100%}.md-typeset img[align=left]{margin:1em 1em 1em 0}.md-typeset img[align=right]{margin:1em 0 1em 1em}.md-typeset img[align]:only-child{margin-top:0}.md-typeset figure{display:flow-root;margin:1em auto;max-width:100%;text-align:center;width:-moz-fit-content;width:fit-content}.md-typeset figure img{display:block;margin:0 auto}.md-typeset figcaption{font-style:italic;margin:1em auto;max-width:24rem}.md-typeset iframe{max-width:100%}.md-typeset table:not([class]){background-color:var(--md-default-bg-color);border:.05rem solid var(--md-typeset-table-color);border-radius:.1rem;display:inline-block;font-size:.64rem;max-width:100%;overflow:auto;touch-action:auto}@media print{.md-typeset table:not([class]){display:table}}.md-typeset table:not([class])+*{margin-top:1.5em}.md-typeset table:not([class]) td>:first-child,.md-typeset table:not([class]) th>:first-child{margin-top:0}.md-typeset table:not([class]) td>:last-child,.md-typeset table:not([class]) th>:last-child{margin-bottom:0}.md-typeset table:not([class]) td:not([align]),.md-typeset table:not([class]) th:not([align]){text-align:left}[dir=rtl] .md-typeset table:not([class]) td:not([align]),[dir=rtl] .md-typeset table:not([class]) th:not([align]){text-align:right}.md-typeset table:not([class]) th{font-weight:700;min-width:5rem;padding:.9375em 1.25em;vertical-align:top}.md-typeset 
table:not([class]) td{border-top:.05rem solid var(--md-typeset-table-color);padding:.9375em 1.25em;vertical-align:top}.md-typeset table:not([class]) tbody tr{transition:background-color 125ms}.md-typeset table:not([class]) tbody tr:hover{background-color:var(--md-typeset-table-color--light);box-shadow:0 .05rem 0 var(--md-default-bg-color) inset}.md-typeset table:not([class]) a{word-break:normal}.md-typeset table th[role=columnheader]{cursor:pointer}[dir=ltr] .md-typeset table th[role=columnheader]:after{margin-left:.5em}[dir=rtl] .md-typeset table th[role=columnheader]:after{margin-right:.5em}.md-typeset table th[role=columnheader]:after{content:"";display:inline-block;height:1.2em;-webkit-mask-image:var(--md-typeset-table-sort-icon);mask-image:var(--md-typeset-table-sort-icon);-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;transition:background-color 125ms;vertical-align:text-bottom;width:1.2em}.md-typeset table th[role=columnheader]:hover:after{background-color:var(--md-default-fg-color--lighter)}.md-typeset table th[role=columnheader][aria-sort=ascending]:after{background-color:var(--md-default-fg-color--light);-webkit-mask-image:var(--md-typeset-table-sort-icon--asc);mask-image:var(--md-typeset-table-sort-icon--asc)}.md-typeset table th[role=columnheader][aria-sort=descending]:after{background-color:var(--md-default-fg-color--light);-webkit-mask-image:var(--md-typeset-table-sort-icon--desc);mask-image:var(--md-typeset-table-sort-icon--desc)}.md-typeset__scrollwrap{margin:1em -.8rem;overflow-x:auto;touch-action:auto}.md-typeset__table{display:inline-block;margin-bottom:.5em;padding:0 .8rem}@media print{.md-typeset__table{display:block}}html .md-typeset__table table{display:table;margin:0;overflow:hidden;width:100%}@media screen and (max-width:44.984375em){.md-content__inner>pre{margin:1em -.8rem}.md-content__inner>pre code{border-radius:0}}.md-typeset 
.md-author{border-radius:100%;display:block;flex-shrink:0;height:1.6rem;overflow:hidden;position:relative;transition:color 125ms,transform 125ms;width:1.6rem}.md-typeset .md-author img{display:block}.md-typeset .md-author--more{background:var(--md-default-fg-color--lightest);color:var(--md-default-fg-color--lighter);font-size:.6rem;font-weight:700;line-height:1.6rem;text-align:center}.md-typeset .md-author--long{height:2.4rem;width:2.4rem}.md-typeset a.md-author{transform:scale(1)}.md-typeset a.md-author img{border-radius:100%;filter:grayscale(100%) opacity(75%);transition:filter 125ms}.md-typeset a.md-author:focus,.md-typeset a.md-author:hover{transform:scale(1.1);z-index:1}.md-typeset a.md-author:focus img,.md-typeset a.md-author:hover img{filter:grayscale(0)}.md-banner{background-color:var(--md-footer-bg-color);color:var(--md-footer-fg-color);overflow:auto}@media print{.md-banner{display:none}}.md-banner--warning{background-color:var(--md-warning-bg-color);color:var(--md-warning-fg-color)}.md-banner__inner{font-size:.7rem;margin:.6rem auto;padding:0 .8rem}[dir=ltr] .md-banner__button{float:right}[dir=rtl] .md-banner__button{float:left}.md-banner__button{color:inherit;cursor:pointer;transition:opacity .25s}.no-js .md-banner__button{display:none}.md-banner__button:hover{opacity:.7}html{font-size:125%;height:100%;overflow-x:hidden}@media screen and (min-width:100em){html{font-size:137.5%}}@media screen and (min-width:125em){html{font-size:150%}}body{background-color:var(--md-default-bg-color);display:flex;flex-direction:column;font-size:.5rem;min-height:100%;position:relative;width:100%}@media print{body{display:block}}@media screen and (max-width:59.984375em){body[data-md-scrolllock]{position:fixed}}.md-grid{margin-left:auto;margin-right:auto;max-width:61rem}.md-container{display:flex;flex-direction:column;flex-grow:1}@media 
print{.md-container{display:block}}.md-main{flex-grow:1}.md-main__inner{display:flex;height:100%;margin-top:1.5rem}.md-ellipsis{overflow:hidden;text-overflow:ellipsis}.md-toggle{display:none}.md-option{height:0;opacity:0;position:absolute;width:0}.md-option:checked+label:not([hidden]){display:block}.md-option.focus-visible+label{outline-color:var(--md-accent-fg-color);outline-style:auto}.md-skip{background-color:var(--md-default-fg-color);border-radius:.1rem;color:var(--md-default-bg-color);font-size:.64rem;margin:.5rem;opacity:0;outline-color:var(--md-accent-fg-color);padding:.3rem .5rem;position:fixed;transform:translateY(.4rem);z-index:-1}.md-skip:focus{opacity:1;transform:translateY(0);transition:transform .25s cubic-bezier(.4,0,.2,1),opacity 175ms 75ms;z-index:10}@page{margin:25mm}:root{--md-clipboard-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 21H8V7h11m0-2H8a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h11a2 2 0 0 0 2-2V7a2 2 0 0 0-2-2m-3-4H4a2 2 0 0 0-2 2v14h2V3h12V1Z"/></svg>')}.md-clipboard{border-radius:.1rem;color:var(--md-default-fg-color--lightest);cursor:pointer;height:1.5em;outline-color:var(--md-accent-fg-color);outline-offset:.1rem;position:absolute;right:.5em;top:.5em;transition:color .25s;width:1.5em;z-index:1}@media print{.md-clipboard{display:none}}.md-clipboard:not(.focus-visible){-webkit-tap-highlight-color:transparent;outline:none}:hover>.md-clipboard{color:var(--md-default-fg-color--light)}.md-clipboard:focus,.md-clipboard:hover{color:var(--md-accent-fg-color)}.md-clipboard:after{background-color:currentcolor;content:"";display:block;height:1.125em;margin:0 auto;-webkit-mask-image:var(--md-clipboard-icon);mask-image:var(--md-clipboard-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:1.125em}.md-clipboard--inline{cursor:pointer}.md-clipboard--inline code{transition:color 
.25s,background-color .25s}.md-clipboard--inline:focus code,.md-clipboard--inline:hover code{background-color:var(--md-accent-fg-color--transparent);color:var(--md-accent-fg-color)}.md-typeset .md-code__content{display:grid}@keyframes consent{0%{opacity:0;transform:translateY(100%)}to{opacity:1;transform:translateY(0)}}@keyframes overlay{0%{opacity:0}to{opacity:1}}.md-consent__overlay{animation:overlay .25s both;-webkit-backdrop-filter:blur(.1rem);backdrop-filter:blur(.1rem);background-color:#0000008a;height:100%;opacity:1;position:fixed;top:0;width:100%;z-index:5}.md-consent__inner{animation:consent .5s cubic-bezier(.1,.7,.1,1) both;background-color:var(--md-default-bg-color);border:0;border-radius:.1rem;bottom:0;box-shadow:0 0 .2rem #0000001a,0 .2rem .4rem #0003;max-height:100%;overflow:auto;padding:0;position:fixed;width:100%;z-index:5}.md-consent__form{padding:.8rem}.md-consent__settings{display:none;margin:1em 0}input:checked+.md-consent__settings{display:block}.md-consent__controls{margin-bottom:.8rem}.md-typeset .md-consent__controls .md-button{display:inline}@media screen and (max-width:44.984375em){.md-typeset .md-consent__controls .md-button{display:block;margin-top:.4rem;text-align:center;width:100%}}.md-consent label{cursor:pointer}.md-content{flex-grow:1;min-width:0}.md-content__inner{margin:0 .8rem 1.2rem;padding-top:.6rem}@media screen and (min-width:76.25em){[dir=ltr] .md-sidebar--primary:not([hidden])~.md-content>.md-content__inner{margin-left:1.2rem}[dir=ltr] .md-sidebar--secondary:not([hidden])~.md-content>.md-content__inner,[dir=rtl] .md-sidebar--primary:not([hidden])~.md-content>.md-content__inner{margin-right:1.2rem}[dir=rtl] .md-sidebar--secondary:not([hidden])~.md-content>.md-content__inner{margin-left:1.2rem}}.md-content__inner:before{content:"";display:block;height:.4rem}.md-content__inner>:last-child{margin-bottom:0}[dir=ltr] .md-content__button{float:right}[dir=rtl] .md-content__button{float:left}[dir=ltr] 
.md-content__button{margin-left:.4rem}[dir=rtl] .md-content__button{margin-right:.4rem}.md-content__button{margin:.4rem 0;padding:0}@media print{.md-content__button{display:none}}.md-typeset .md-content__button{color:var(--md-default-fg-color--lighter)}.md-content__button svg{display:inline;vertical-align:top}[dir=rtl] .md-content__button svg{transform:scaleX(-1)}[dir=ltr] .md-dialog{right:.8rem}[dir=rtl] .md-dialog{left:.8rem}.md-dialog{background-color:var(--md-default-fg-color);border-radius:.1rem;bottom:.8rem;box-shadow:var(--md-shadow-z3);min-width:11.1rem;opacity:0;padding:.4rem .6rem;pointer-events:none;position:fixed;transform:translateY(100%);transition:transform 0ms .4s,opacity .4s;z-index:4}@media print{.md-dialog{display:none}}.md-dialog--active{opacity:1;pointer-events:auto;transform:translateY(0);transition:transform .4s cubic-bezier(.075,.85,.175,1),opacity .4s}.md-dialog__inner{color:var(--md-default-bg-color);font-size:.7rem}.md-feedback{margin:2em 0 1em;text-align:center}.md-feedback fieldset{border:none;margin:0;padding:0}.md-feedback__title{font-weight:700;margin:1em auto}.md-feedback__inner{position:relative}.md-feedback__list{display:flex;flex-wrap:wrap;place-content:baseline center;position:relative}.md-feedback__list:hover .md-icon:not(:disabled){color:var(--md-default-fg-color--lighter)}:disabled .md-feedback__list{min-height:1.8rem}.md-feedback__icon{color:var(--md-default-fg-color--light);cursor:pointer;flex-shrink:0;margin:0 .1rem;transition:color 125ms}.md-feedback__icon:not(:disabled).md-icon:hover{color:var(--md-accent-fg-color)}.md-feedback__icon:disabled{color:var(--md-default-fg-color--lightest);pointer-events:none}.md-feedback__note{opacity:0;position:relative;transform:translateY(.4rem);transition:transform .4s cubic-bezier(.1,.7,.1,1),opacity .15s}.md-feedback__note>*{margin:0 auto;max-width:16rem}:disabled 
.md-feedback__note{opacity:1;transform:translateY(0)}.md-footer{background-color:var(--md-footer-bg-color);color:var(--md-footer-fg-color)}@media print{.md-footer{display:none}}.md-footer__inner{justify-content:space-between;overflow:auto;padding:.2rem}.md-footer__inner:not([hidden]){display:flex}.md-footer__link{align-items:end;display:flex;flex-grow:0.01;margin-bottom:.4rem;margin-top:1rem;max-width:100%;outline-color:var(--md-accent-fg-color);overflow:hidden;transition:opacity .25s}.md-footer__link:focus,.md-footer__link:hover{opacity:.7}[dir=rtl] .md-footer__link svg{transform:scaleX(-1)}@media screen and (max-width:44.984375em){.md-footer__link--prev{flex-shrink:0}.md-footer__link--prev .md-footer__title{display:none}}[dir=ltr] .md-footer__link--next{margin-left:auto}[dir=rtl] .md-footer__link--next{margin-right:auto}.md-footer__link--next{text-align:right}[dir=rtl] .md-footer__link--next{text-align:left}.md-footer__title{flex-grow:1;font-size:.9rem;margin-bottom:.7rem;max-width:calc(100% - 2.4rem);padding:0 1rem;white-space:nowrap}.md-footer__button{margin:.2rem;padding:.4rem}.md-footer__direction{font-size:.64rem;opacity:.7}.md-footer-meta{background-color:var(--md-footer-bg-color--dark)}.md-footer-meta__inner{display:flex;flex-wrap:wrap;justify-content:space-between;padding:.2rem}html .md-footer-meta.md-typeset a{color:var(--md-footer-fg-color--light)}html .md-footer-meta.md-typeset a:focus,html .md-footer-meta.md-typeset a:hover{color:var(--md-footer-fg-color)}.md-copyright{color:var(--md-footer-fg-color--lighter);font-size:.64rem;margin:auto .6rem;padding:.4rem 0;width:100%}@media screen and (min-width:45em){.md-copyright{width:auto}}.md-copyright__highlight{color:var(--md-footer-fg-color--light)}.md-social{display:inline-flex;gap:.2rem;margin:0 .4rem;padding:.2rem 0 .6rem}@media screen and (min-width:45em){.md-social{padding:.6rem 
0}}.md-social__link{display:inline-block;height:1.6rem;text-align:center;width:1.6rem}.md-social__link:before{line-height:1.9}.md-social__link svg{fill:currentcolor;max-height:.8rem;vertical-align:-25%}.md-typeset .md-button{border:.1rem solid;border-radius:.1rem;color:var(--md-primary-fg-color);cursor:pointer;display:inline-block;font-weight:700;padding:.625em 2em;transition:color 125ms,background-color 125ms,border-color 125ms}.md-typeset .md-button--primary{background-color:var(--md-primary-fg-color);border-color:var(--md-primary-fg-color);color:var(--md-primary-bg-color)}.md-typeset .md-button:focus,.md-typeset .md-button:hover{background-color:var(--md-accent-fg-color);border-color:var(--md-accent-fg-color);color:var(--md-accent-bg-color)}[dir=ltr] .md-typeset .md-input{border-top-left-radius:.1rem}[dir=ltr] .md-typeset .md-input,[dir=rtl] .md-typeset .md-input{border-top-right-radius:.1rem}[dir=rtl] .md-typeset .md-input{border-top-left-radius:.1rem}.md-typeset .md-input{border-bottom:.1rem solid var(--md-default-fg-color--lighter);box-shadow:var(--md-shadow-z1);font-size:.8rem;height:1.8rem;padding:0 .6rem;transition:border .25s,box-shadow .25s}.md-typeset .md-input:focus,.md-typeset .md-input:hover{border-bottom-color:var(--md-accent-fg-color);box-shadow:var(--md-shadow-z2)}.md-typeset .md-input--stretch{width:100%}.md-header{background-color:var(--md-primary-fg-color);box-shadow:0 0 .2rem #0000,0 .2rem .4rem #0000;color:var(--md-primary-bg-color);display:block;left:0;position:sticky;right:0;top:0;z-index:4}@media print{.md-header{display:none}}.md-header[hidden]{transform:translateY(-100%);transition:transform .25s cubic-bezier(.8,0,.6,1),box-shadow .25s}.md-header--shadow{box-shadow:0 0 .2rem #0000001a,0 .2rem .4rem #0003;transition:transform .25s cubic-bezier(.1,.7,.1,1),box-shadow .25s}.md-header__inner{align-items:center;display:flex;padding:0 
.2rem}.md-header__button{color:currentcolor;cursor:pointer;margin:.2rem;outline-color:var(--md-accent-fg-color);padding:.4rem;position:relative;transition:opacity .25s;vertical-align:middle;z-index:1}.md-header__button:hover{opacity:.7}.md-header__button:not([hidden]){display:inline-block}.md-header__button:not(.focus-visible){-webkit-tap-highlight-color:transparent;outline:none}.md-header__button.md-logo{margin:.2rem;padding:.4rem}@media screen and (max-width:76.234375em){.md-header__button.md-logo{display:none}}.md-header__button.md-logo img,.md-header__button.md-logo svg{fill:currentcolor;display:block;height:1.2rem;width:auto}@media screen and (min-width:60em){.md-header__button[for=__search]{display:none}}.no-js .md-header__button[for=__search]{display:none}[dir=rtl] .md-header__button[for=__search] svg{transform:scaleX(-1)}@media screen and (min-width:76.25em){.md-header__button[for=__drawer]{display:none}}.md-header__topic{display:flex;max-width:100%;position:absolute;transition:transform .4s cubic-bezier(.1,.7,.1,1),opacity .15s;white-space:nowrap}.md-header__topic+.md-header__topic{opacity:0;pointer-events:none;transform:translateX(1.25rem);transition:transform .4s cubic-bezier(1,.7,.1,.1),opacity .15s;z-index:-1}[dir=rtl] .md-header__topic+.md-header__topic{transform:translateX(-1.25rem)}.md-header__topic:first-child{font-weight:700}[dir=ltr] .md-header__title{margin-left:1rem;margin-right:.4rem}[dir=rtl] .md-header__title{margin-left:.4rem;margin-right:1rem}.md-header__title{flex-grow:1;font-size:.9rem;height:2.4rem;line-height:2.4rem}.md-header__title--active .md-header__topic{opacity:0;pointer-events:none;transform:translateX(-1.25rem);transition:transform .4s cubic-bezier(1,.7,.1,.1),opacity .15s;z-index:-1}[dir=rtl] .md-header__title--active .md-header__topic{transform:translateX(1.25rem)}.md-header__title--active .md-header__topic+.md-header__topic{opacity:1;pointer-events:auto;transform:translateX(0);transition:transform .4s 
cubic-bezier(.1,.7,.1,1),opacity .15s;z-index:0}.md-header__title>.md-header__ellipsis{height:100%;position:relative;width:100%}.md-header__option{display:flex;flex-shrink:0;max-width:100%;transition:max-width 0ms .25s,opacity .25s .25s;white-space:nowrap}[data-md-toggle=search]:checked~.md-header .md-header__option{max-width:0;opacity:0;transition:max-width 0ms,opacity 0ms}.md-header__option>input{bottom:0}.md-header__source{display:none}@media screen and (min-width:60em){[dir=ltr] .md-header__source{margin-left:1rem}[dir=rtl] .md-header__source{margin-right:1rem}.md-header__source{display:block;max-width:11.7rem;width:11.7rem}}@media screen and (min-width:76.25em){[dir=ltr] .md-header__source{margin-left:1.4rem}[dir=rtl] .md-header__source{margin-right:1.4rem}}.md-meta{color:var(--md-default-fg-color--light);font-size:.7rem;line-height:1.3}.md-meta__list{display:inline-flex;flex-wrap:wrap;list-style:none;margin:0;padding:0}.md-meta__item:not(:last-child):after{content:"·";margin-left:.2rem;margin-right:.2rem}.md-meta__link{color:var(--md-typeset-a-color)}.md-meta__link:focus,.md-meta__link:hover{color:var(--md-accent-fg-color)}.md-draft{background-color:#ff1744;border-radius:.125em;color:#fff;display:inline-block;font-weight:700;padding-left:.5714285714em;padding-right:.5714285714em}:root{--md-nav-icon--prev:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>');--md-nav-icon--next:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M8.59 16.58 13.17 12 8.59 7.41 10 6l6 6-6 6-1.41-1.42Z"/></svg>');--md-toc-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 9h14V7H3v2m0 4h14v-2H3v2m0 4h14v-2H3v2m16 0h2v-2h-2v2m0-10v2h2V7h-2m0 
6h2v-2h-2v2Z"/></svg>')}.md-nav{font-size:.7rem;line-height:1.3}.md-nav__title{color:var(--md-default-fg-color--light);display:block;font-weight:700;overflow:hidden;padding:0 .6rem;text-overflow:ellipsis}.md-nav__title .md-nav__button{display:none}.md-nav__title .md-nav__button img{height:100%;width:auto}.md-nav__title .md-nav__button.md-logo img,.md-nav__title .md-nav__button.md-logo svg{fill:currentcolor;display:block;height:2.4rem;max-width:100%;object-fit:contain;width:auto}.md-nav__list{list-style:none;margin:0;padding:0}.md-nav__link{align-items:flex-start;display:flex;gap:.4rem;margin-top:.625em;scroll-snap-align:start;transition:color 125ms}.md-nav__link--passed{color:var(--md-default-fg-color--light)}.md-nav__item .md-nav__link--active,.md-nav__item .md-nav__link--active code{color:var(--md-typeset-a-color)}.md-nav__link .md-ellipsis{position:relative}[dir=ltr] .md-nav__link .md-icon:last-child{margin-left:auto}[dir=rtl] .md-nav__link .md-icon:last-child{margin-right:auto}.md-nav__link svg{fill:currentcolor;flex-shrink:0;height:1.3em}.md-nav__link[for]:focus,.md-nav__link[for]:hover,.md-nav__link[href]:focus,.md-nav__link[href]:hover{color:var(--md-accent-fg-color);cursor:pointer}.md-nav__link.focus-visible{outline-color:var(--md-accent-fg-color);outline-offset:.2rem}.md-nav--primary .md-nav__link[for=__toc]{display:none}.md-nav--primary .md-nav__link[for=__toc] .md-icon:after{background-color:currentcolor;display:block;height:100%;-webkit-mask-image:var(--md-toc-icon);mask-image:var(--md-toc-icon);width:100%}.md-nav--primary .md-nav__link[for=__toc]~.md-nav{display:none}.md-nav__container>.md-nav__link{margin-top:0}.md-nav__container>.md-nav__link:first-child{flex-grow:1;min-width:0}.md-nav__icon{flex-shrink:0}.md-nav__source{display:none}@media screen and (max-width:76.234375em){.md-nav--primary,.md-nav--primary 
.md-nav{background-color:var(--md-default-bg-color);display:flex;flex-direction:column;height:100%;left:0;position:absolute;right:0;top:0;z-index:1}.md-nav--primary .md-nav__item,.md-nav--primary .md-nav__title{font-size:.8rem;line-height:1.5}.md-nav--primary .md-nav__title{background-color:var(--md-default-fg-color--lightest);color:var(--md-default-fg-color--light);cursor:pointer;height:5.6rem;line-height:2.4rem;padding:3rem .8rem .2rem;position:relative;white-space:nowrap}[dir=ltr] .md-nav--primary .md-nav__title .md-nav__icon{left:.4rem}[dir=rtl] .md-nav--primary .md-nav__title .md-nav__icon{right:.4rem}.md-nav--primary .md-nav__title .md-nav__icon{display:block;height:1.2rem;margin:.2rem;position:absolute;top:.4rem;width:1.2rem}.md-nav--primary .md-nav__title .md-nav__icon:after{background-color:currentcolor;content:"";display:block;height:100%;-webkit-mask-image:var(--md-nav-icon--prev);mask-image:var(--md-nav-icon--prev);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:100%}.md-nav--primary .md-nav__title~.md-nav__list{background-color:var(--md-default-bg-color);box-shadow:0 .05rem 0 var(--md-default-fg-color--lightest) inset;overflow-y:auto;scroll-snap-type:y mandatory;touch-action:pan-y}.md-nav--primary .md-nav__title~.md-nav__list>:first-child{border-top:0}.md-nav--primary .md-nav__title[for=__drawer]{background-color:var(--md-primary-fg-color);color:var(--md-primary-bg-color);font-weight:700}.md-nav--primary .md-nav__title .md-logo{display:block;left:.2rem;margin:.2rem;padding:.4rem;position:absolute;right:.2rem;top:.2rem}.md-nav--primary .md-nav__list{flex:1}.md-nav--primary .md-nav__item{border-top:.05rem solid var(--md-default-fg-color--lightest)}.md-nav--primary .md-nav__item--active>.md-nav__link{color:var(--md-typeset-a-color)}.md-nav--primary .md-nav__item--active>.md-nav__link:focus,.md-nav--primary 
.md-nav__item--active>.md-nav__link:hover{color:var(--md-accent-fg-color)}.md-nav--primary .md-nav__link{margin-top:0;padding:.6rem .8rem}.md-nav--primary .md-nav__link svg{margin-top:.1em}.md-nav--primary .md-nav__link>.md-nav__link{padding:0}[dir=ltr] .md-nav--primary .md-nav__link .md-nav__icon{margin-right:-.2rem}[dir=rtl] .md-nav--primary .md-nav__link .md-nav__icon{margin-left:-.2rem}.md-nav--primary .md-nav__link .md-nav__icon{font-size:1.2rem;height:1.2rem;width:1.2rem}.md-nav--primary .md-nav__link .md-nav__icon:after{background-color:currentcolor;content:"";display:block;height:100%;-webkit-mask-image:var(--md-nav-icon--next);mask-image:var(--md-nav-icon--next);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:100%}[dir=rtl] .md-nav--primary .md-nav__icon:after{transform:scale(-1)}.md-nav--primary .md-nav--secondary .md-nav{background-color:initial;position:static}[dir=ltr] .md-nav--primary .md-nav--secondary .md-nav .md-nav__link{padding-left:1.4rem}[dir=rtl] .md-nav--primary .md-nav--secondary .md-nav .md-nav__link{padding-right:1.4rem}[dir=ltr] .md-nav--primary .md-nav--secondary .md-nav .md-nav .md-nav__link{padding-left:2rem}[dir=rtl] .md-nav--primary .md-nav--secondary .md-nav .md-nav .md-nav__link{padding-right:2rem}[dir=ltr] .md-nav--primary .md-nav--secondary .md-nav .md-nav .md-nav .md-nav__link{padding-left:2.6rem}[dir=rtl] .md-nav--primary .md-nav--secondary .md-nav .md-nav .md-nav .md-nav__link{padding-right:2.6rem}[dir=ltr] .md-nav--primary .md-nav--secondary .md-nav .md-nav .md-nav .md-nav .md-nav__link{padding-left:3.2rem}[dir=rtl] .md-nav--primary .md-nav--secondary .md-nav .md-nav .md-nav .md-nav .md-nav__link{padding-right:3.2rem}.md-nav--secondary{background-color:initial}.md-nav__toggle~.md-nav{display:flex;opacity:0;transform:translateX(100%);transition:transform .25s cubic-bezier(.8,0,.6,1),opacity 125ms 50ms}[dir=rtl] 
.md-nav__toggle~.md-nav{transform:translateX(-100%)}.md-nav__toggle:checked~.md-nav{opacity:1;transform:translateX(0);transition:transform .25s cubic-bezier(.4,0,.2,1),opacity 125ms 125ms}.md-nav__toggle:checked~.md-nav>.md-nav__list{-webkit-backface-visibility:hidden;backface-visibility:hidden}}@media screen and (max-width:59.984375em){.md-nav--primary .md-nav__link[for=__toc]{display:flex}.md-nav--primary .md-nav__link[for=__toc] .md-icon:after{content:""}.md-nav--primary .md-nav__link[for=__toc]+.md-nav__link{display:none}.md-nav--primary .md-nav__link[for=__toc]~.md-nav{display:flex}.md-nav__source{background-color:var(--md-primary-fg-color--dark);color:var(--md-primary-bg-color);display:block;padding:0 .2rem}}@media screen and (min-width:60em) and (max-width:76.234375em){.md-nav--integrated .md-nav__link[for=__toc]{display:flex}.md-nav--integrated .md-nav__link[for=__toc] .md-icon:after{content:""}.md-nav--integrated .md-nav__link[for=__toc]+.md-nav__link{display:none}.md-nav--integrated .md-nav__link[for=__toc]~.md-nav{display:flex}}@media screen and (min-width:60em){.md-nav{margin-bottom:-.4rem}.md-nav--secondary .md-nav__title{background:var(--md-default-bg-color);box-shadow:0 0 .4rem .4rem var(--md-default-bg-color);position:sticky;top:0;z-index:1}.md-nav--secondary .md-nav__title[for=__toc]{scroll-snap-align:start}.md-nav--secondary .md-nav__title .md-nav__icon{display:none}[dir=ltr] .md-nav--secondary .md-nav__list{padding-left:.6rem}[dir=rtl] .md-nav--secondary .md-nav__list{padding-right:.6rem}.md-nav--secondary .md-nav__list{padding-bottom:.4rem}[dir=ltr] .md-nav--secondary .md-nav__item>.md-nav__link{margin-right:.4rem}[dir=rtl] .md-nav--secondary .md-nav__item>.md-nav__link{margin-left:.4rem}}@media screen and (min-width:76.25em){.md-nav{margin-bottom:-.4rem;transition:max-height .25s cubic-bezier(.86,0,.07,1)}.md-nav--primary .md-nav__title{background:var(--md-default-bg-color);box-shadow:0 0 .4rem .4rem 
var(--md-default-bg-color);position:sticky;top:0;z-index:1}.md-nav--primary .md-nav__title[for=__drawer]{scroll-snap-align:start}.md-nav--primary .md-nav__title .md-nav__icon{display:none}[dir=ltr] .md-nav--primary .md-nav__list{padding-left:.6rem}[dir=rtl] .md-nav--primary .md-nav__list{padding-right:.6rem}.md-nav--primary .md-nav__list{padding-bottom:.4rem}[dir=ltr] .md-nav--primary .md-nav__item>.md-nav__link{margin-right:.4rem}[dir=rtl] .md-nav--primary .md-nav__item>.md-nav__link{margin-left:.4rem}.md-nav__toggle~.md-nav{display:grid;grid-template-rows:0fr;opacity:0;transition:grid-template-rows .25s cubic-bezier(.86,0,.07,1),opacity .25s,visibility 0ms .25s;visibility:collapse}.md-nav__toggle~.md-nav>.md-nav__list{overflow:hidden}.md-nav__toggle.md-toggle--indeterminate~.md-nav,.md-nav__toggle:checked~.md-nav{grid-template-rows:1fr;opacity:1;transition:grid-template-rows .25s cubic-bezier(.86,0,.07,1),opacity .15s .1s,visibility 0ms;visibility:visible}.md-nav__toggle.md-toggle--indeterminate~.md-nav{transition:none}.md-nav__item--nested>.md-nav>.md-nav__title{display:none}.md-nav__item--section{display:block;margin:1.25em 0}.md-nav__item--section:last-child{margin-bottom:0}.md-nav__item--section>.md-nav__link{font-weight:700}.md-nav__item--section>.md-nav__link[for]{color:var(--md-default-fg-color--light)}.md-nav__item--section>.md-nav__link:not(.md-nav__container){pointer-events:none}.md-nav__item--section>.md-nav__link .md-icon,.md-nav__item--section>.md-nav__link>[for]{display:none}[dir=ltr] .md-nav__item--section>.md-nav{margin-left:-.6rem}[dir=rtl] .md-nav__item--section>.md-nav{margin-right:-.6rem}.md-nav__item--section>.md-nav{display:block;opacity:1;visibility:visible}.md-nav__item--section>.md-nav>.md-nav__list>.md-nav__item{padding:0}.md-nav__icon{border-radius:100%;height:.9rem;transition:background-color 
.25s;width:.9rem}.md-nav__icon:hover{background-color:var(--md-accent-fg-color--transparent)}.md-nav__icon:after{background-color:currentcolor;border-radius:100%;content:"";display:inline-block;height:100%;-webkit-mask-image:var(--md-nav-icon--next);mask-image:var(--md-nav-icon--next);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;transition:transform .25s;vertical-align:-.1rem;width:100%}[dir=rtl] .md-nav__icon:after{transform:rotate(180deg)}.md-nav__item--nested .md-nav__toggle:checked~.md-nav__link .md-nav__icon:after,.md-nav__item--nested .md-toggle--indeterminate~.md-nav__link .md-nav__icon:after{transform:rotate(90deg)}.md-nav--lifted>.md-nav__list>.md-nav__item,.md-nav--lifted>.md-nav__title{display:none}.md-nav--lifted>.md-nav__list>.md-nav__item--active{display:block}.md-nav--lifted>.md-nav__list>.md-nav__item--active>.md-nav__link{background:var(--md-default-bg-color);box-shadow:0 0 .4rem .4rem var(--md-default-bg-color);margin-top:0;position:sticky;top:0;z-index:1}.md-nav--lifted>.md-nav__list>.md-nav__item--active>.md-nav__link:not(.md-nav__container){pointer-events:none}.md-nav--lifted>.md-nav__list>.md-nav__item--active.md-nav__item--section{margin:0}[dir=ltr] .md-nav--lifted>.md-nav__list>.md-nav__item>.md-nav:not(.md-nav--secondary){margin-left:-.6rem}[dir=rtl] .md-nav--lifted>.md-nav__list>.md-nav__item>.md-nav:not(.md-nav--secondary){margin-right:-.6rem}.md-nav--lifted>.md-nav__list>.md-nav__item>[for]{color:var(--md-default-fg-color--light)}.md-nav--lifted .md-nav[data-md-level="1"]{grid-template-rows:1fr;opacity:1;visibility:visible}[dir=ltr] .md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary{border-left:.05rem solid var(--md-primary-fg-color)}[dir=rtl] .md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary{border-right:.05rem solid 
var(--md-primary-fg-color)}.md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary{display:block;margin-bottom:1.25em;opacity:1;visibility:visible}.md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary>.md-nav__list{overflow:visible;padding-bottom:0}.md-nav--integrated>.md-nav__list>.md-nav__item--active .md-nav--secondary>.md-nav__title{display:none}}.md-pagination{font-size:.8rem;font-weight:700;gap:.4rem}.md-pagination,.md-pagination>*{align-items:center;display:flex;justify-content:center}.md-pagination>*{border-radius:.2rem;height:1.8rem;min-width:1.8rem;text-align:center}.md-pagination__current{background-color:var(--md-default-fg-color--lightest);color:var(--md-default-fg-color--light)}.md-pagination__link{transition:color 125ms,background-color 125ms}.md-pagination__link:focus,.md-pagination__link:hover{background-color:var(--md-accent-fg-color--transparent);color:var(--md-accent-fg-color)}.md-pagination__link:focus svg,.md-pagination__link:hover svg{color:var(--md-accent-fg-color)}.md-pagination__link.focus-visible{outline-color:var(--md-accent-fg-color);outline-offset:.2rem}.md-pagination__link svg{fill:currentcolor;color:var(--md-default-fg-color--lighter);display:block;max-height:100%;width:1.2rem}.md-post__back{border-bottom:.05rem solid var(--md-default-fg-color--lightest);margin-bottom:1.2rem;padding-bottom:1.2rem}@media screen and (max-width:76.234375em){.md-post__back{display:none}}[dir=rtl] .md-post__back svg{transform:scaleX(-1)}.md-post__authors{display:flex;flex-direction:column;gap:.6rem;margin:0 .6rem 1.2rem}.md-post .md-post__meta a{transition:color 125ms}.md-post .md-post__meta a:focus,.md-post .md-post__meta a:hover{color:var(--md-accent-fg-color)}.md-post__title{color:var(--md-default-fg-color--light);font-weight:700}.md-post--excerpt{margin-bottom:3.2rem}.md-post--excerpt .md-post__header{align-items:center;display:flex;gap:.6rem;min-height:1.6rem}.md-post--excerpt 
.md-post__authors{align-items:center;display:inline-flex;flex-direction:row;gap:.2rem;margin:0;min-height:2.4rem}[dir=ltr] .md-post--excerpt .md-post__meta .md-meta__list{margin-right:.4rem}[dir=rtl] .md-post--excerpt .md-post__meta .md-meta__list{margin-left:.4rem}.md-post--excerpt .md-post__content>:first-child{--md-scroll-margin:6rem;margin-top:0}.md-post>.md-nav--secondary{margin:1em 0}.md-profile{align-items:center;display:flex;font-size:.7rem;gap:.6rem;line-height:1.4;width:100%}.md-profile__description{flex-grow:1}.md-content--post{display:flex}@media screen and (max-width:76.234375em){.md-content--post{flex-flow:column-reverse}}.md-content--post>.md-content__inner{min-width:0}@media screen and (min-width:76.25em){[dir=ltr] .md-content--post>.md-content__inner{margin-left:1.2rem}[dir=rtl] .md-content--post>.md-content__inner{margin-right:1.2rem}}@media screen and (max-width:76.234375em){.md-sidebar.md-sidebar--post{padding:0;position:static;width:100%}.md-sidebar.md-sidebar--post .md-sidebar__scrollwrap{overflow:visible}.md-sidebar.md-sidebar--post .md-sidebar__inner{padding:0}.md-sidebar.md-sidebar--post .md-post__meta{margin-left:.6rem;margin-right:.6rem}.md-sidebar.md-sidebar--post .md-nav__item{border:none;display:inline}.md-sidebar.md-sidebar--post .md-nav__list{display:inline-flex;flex-wrap:wrap;gap:.6rem;padding-bottom:.6rem;padding-top:.6rem}.md-sidebar.md-sidebar--post .md-nav__link{padding:0}.md-sidebar.md-sidebar--post .md-nav{height:auto;margin-bottom:0;position:static}}:root{--md-progress-value:0;--md-progress-delay:400ms}.md-progress{background:var(--md-primary-bg-color);height:.075rem;opacity:min(clamp(0,var(--md-progress-value),1),clamp(0,100 - var(--md-progress-value),1));position:fixed;top:0;transform:scaleX(calc(var(--md-progress-value)*1%));transform-origin:left;transition:transform .5s cubic-bezier(.19,1,.22,1),opacity .25s 
var(--md-progress-delay);width:100%;z-index:4}:root{--md-search-result-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h7c-.41-.25-.8-.56-1.14-.9-.33-.33-.61-.7-.86-1.1H6V4h7v5h5v1.18c.71.16 1.39.43 2 .82V8l-6-6m6.31 16.9c1.33-2.11.69-4.9-1.4-6.22-2.11-1.33-4.91-.68-6.22 1.4-1.34 2.11-.69 4.89 1.4 6.22 1.46.93 3.32.93 4.79.02L22 23.39 23.39 22l-3.08-3.1m-3.81.1a2.5 2.5 0 0 1-2.5-2.5 2.5 2.5 0 0 1 2.5-2.5 2.5 2.5 0 0 1 2.5 2.5 2.5 2.5 0 0 1-2.5 2.5Z"/></svg>')}.md-search{position:relative}@media screen and (min-width:60em){.md-search{padding:.2rem 0}}.no-js .md-search{display:none}.md-search__overlay{opacity:0;z-index:1}@media screen and (max-width:59.984375em){[dir=ltr] .md-search__overlay{left:-2.2rem}[dir=rtl] .md-search__overlay{right:-2.2rem}.md-search__overlay{background-color:var(--md-default-bg-color);border-radius:1rem;height:2rem;overflow:hidden;pointer-events:none;position:absolute;top:-1rem;transform-origin:center;transition:transform .3s .1s,opacity .2s .2s;width:2rem}[data-md-toggle=search]:checked~.md-header .md-search__overlay{opacity:1;transition:transform .4s,opacity .1s}}@media screen and (min-width:60em){[dir=ltr] .md-search__overlay{left:0}[dir=rtl] .md-search__overlay{right:0}.md-search__overlay{background-color:#0000008a;cursor:pointer;height:0;position:fixed;top:0;transition:width 0ms .25s,height 0ms .25s,opacity .25s;width:0}[data-md-toggle=search]:checked~.md-header .md-search__overlay{height:200vh;opacity:1;transition:width 0ms,height 0ms,opacity .25s;width:100%}}@media screen and (max-width:29.984375em){[data-md-toggle=search]:checked~.md-header .md-search__overlay{transform:scale(45)}}@media screen and (min-width:30em) and (max-width:44.984375em){[data-md-toggle=search]:checked~.md-header .md-search__overlay{transform:scale(60)}}@media screen and (min-width:45em) and (max-width:59.984375em){[data-md-toggle=search]:checked~.md-header 
.md-search__overlay{transform:scale(75)}}.md-search__inner{-webkit-backface-visibility:hidden;backface-visibility:hidden}@media screen and (max-width:59.984375em){[dir=ltr] .md-search__inner{left:0}[dir=rtl] .md-search__inner{right:0}.md-search__inner{height:0;opacity:0;overflow:hidden;position:fixed;top:0;transform:translateX(5%);transition:width 0ms .3s,height 0ms .3s,transform .15s cubic-bezier(.4,0,.2,1) .15s,opacity .15s .15s;width:0;z-index:2}[dir=rtl] .md-search__inner{transform:translateX(-5%)}[data-md-toggle=search]:checked~.md-header .md-search__inner{height:100%;opacity:1;transform:translateX(0);transition:width 0ms 0ms,height 0ms 0ms,transform .15s cubic-bezier(.1,.7,.1,1) .15s,opacity .15s .15s;width:100%}}@media screen and (min-width:60em){[dir=ltr] .md-search__inner{float:right}[dir=rtl] .md-search__inner{float:left}.md-search__inner{padding:.1rem 0;position:relative;transition:width .25s cubic-bezier(.1,.7,.1,1);width:11.7rem}}@media screen and (min-width:60em) and (max-width:76.234375em){[data-md-toggle=search]:checked~.md-header .md-search__inner{width:23.4rem}}@media screen and (min-width:76.25em){[data-md-toggle=search]:checked~.md-header .md-search__inner{width:34.4rem}}.md-search__form{background-color:var(--md-default-bg-color);box-shadow:0 0 .6rem #0000;height:2.4rem;position:relative;transition:color .25s,background-color .25s;z-index:2}@media screen and (min-width:60em){.md-search__form{background-color:#00000042;border-radius:.1rem;height:1.8rem}.md-search__form:hover{background-color:#ffffff1f}}[data-md-toggle=search]:checked~.md-header .md-search__form{background-color:var(--md-default-bg-color);border-radius:.1rem .1rem 0 0;box-shadow:0 0 .6rem #00000012;color:var(--md-default-fg-color)}[dir=ltr] .md-search__input{padding-left:3.6rem;padding-right:2.2rem}[dir=rtl] 
.md-search__input{padding-left:2.2rem;padding-right:3.6rem}.md-search__input{background:#0000;font-size:.9rem;height:100%;position:relative;text-overflow:ellipsis;width:100%;z-index:2}.md-search__input::placeholder{transition:color .25s}.md-search__input::placeholder,.md-search__input~.md-search__icon{color:var(--md-default-fg-color--light)}.md-search__input::-ms-clear{display:none}@media screen and (max-width:59.984375em){.md-search__input{font-size:.9rem;height:2.4rem;width:100%}}@media screen and (min-width:60em){[dir=ltr] .md-search__input{padding-left:2.2rem}[dir=rtl] .md-search__input{padding-right:2.2rem}.md-search__input{color:inherit;font-size:.8rem}.md-search__input::placeholder{color:var(--md-primary-bg-color--light)}.md-search__input+.md-search__icon{color:var(--md-primary-bg-color)}[data-md-toggle=search]:checked~.md-header .md-search__input{text-overflow:clip}[data-md-toggle=search]:checked~.md-header .md-search__input+.md-search__icon{color:var(--md-default-fg-color--light)}[data-md-toggle=search]:checked~.md-header .md-search__input::placeholder{color:#0000}}.md-search__icon{cursor:pointer;display:inline-block;height:1.2rem;transition:color .25s,opacity .25s;width:1.2rem}.md-search__icon:hover{opacity:.7}[dir=ltr] .md-search__icon[for=__search]{left:.5rem}[dir=rtl] .md-search__icon[for=__search]{right:.5rem}.md-search__icon[for=__search]{position:absolute;top:.3rem;z-index:2}[dir=rtl] .md-search__icon[for=__search] svg{transform:scaleX(-1)}@media screen and (max-width:59.984375em){[dir=ltr] .md-search__icon[for=__search]{left:.8rem}[dir=rtl] .md-search__icon[for=__search]{right:.8rem}.md-search__icon[for=__search]{top:.6rem}.md-search__icon[for=__search] svg:first-child{display:none}}@media screen and (min-width:60em){.md-search__icon[for=__search]{pointer-events:none}.md-search__icon[for=__search] svg:last-child{display:none}}[dir=ltr] .md-search__options{right:.5rem}[dir=rtl] 
.md-search__options{left:.5rem}.md-search__options{pointer-events:none;position:absolute;top:.3rem;z-index:2}@media screen and (max-width:59.984375em){[dir=ltr] .md-search__options{right:.8rem}[dir=rtl] .md-search__options{left:.8rem}.md-search__options{top:.6rem}}[dir=ltr] .md-search__options>.md-icon{margin-left:.2rem}[dir=rtl] .md-search__options>.md-icon{margin-right:.2rem}.md-search__options>.md-icon{color:var(--md-default-fg-color--light);opacity:0;transform:scale(.75);transition:transform .15s cubic-bezier(.1,.7,.1,1),opacity .15s}.md-search__options>.md-icon:not(.focus-visible){-webkit-tap-highlight-color:transparent;outline:none}[data-md-toggle=search]:checked~.md-header .md-search__input:valid~.md-search__options>.md-icon{opacity:1;pointer-events:auto;transform:scale(1)}[data-md-toggle=search]:checked~.md-header .md-search__input:valid~.md-search__options>.md-icon:hover{opacity:.7}[dir=ltr] .md-search__suggest{padding-left:3.6rem;padding-right:2.2rem}[dir=rtl] .md-search__suggest{padding-left:2.2rem;padding-right:3.6rem}.md-search__suggest{align-items:center;color:var(--md-default-fg-color--lighter);display:flex;font-size:.9rem;height:100%;opacity:0;position:absolute;top:0;transition:opacity 50ms;white-space:nowrap;width:100%}@media screen and (min-width:60em){[dir=ltr] .md-search__suggest{padding-left:2.2rem}[dir=rtl] .md-search__suggest{padding-right:2.2rem}.md-search__suggest{font-size:.8rem}}[data-md-toggle=search]:checked~.md-header .md-search__suggest{opacity:1;transition:opacity .3s .1s}[dir=ltr] .md-search__output{border-bottom-left-radius:.1rem}[dir=ltr] .md-search__output,[dir=rtl] .md-search__output{border-bottom-right-radius:.1rem}[dir=rtl] .md-search__output{border-bottom-left-radius:.1rem}.md-search__output{overflow:hidden;position:absolute;width:100%;z-index:1}@media screen and (max-width:59.984375em){.md-search__output{bottom:0;top:2.4rem}}@media screen and (min-width:60em){.md-search__output{opacity:0;top:1.9rem;transition:opacity 
.4s}[data-md-toggle=search]:checked~.md-header .md-search__output{box-shadow:var(--md-shadow-z3);opacity:1}}.md-search__scrollwrap{-webkit-backface-visibility:hidden;backface-visibility:hidden;background-color:var(--md-default-bg-color);height:100%;overflow-y:auto;touch-action:pan-y}@media (-webkit-max-device-pixel-ratio:1),(max-resolution:1dppx){.md-search__scrollwrap{transform:translateZ(0)}}@media screen and (min-width:60em) and (max-width:76.234375em){.md-search__scrollwrap{width:23.4rem}}@media screen and (min-width:76.25em){.md-search__scrollwrap{width:34.4rem}}@media screen and (min-width:60em){.md-search__scrollwrap{max-height:0;scrollbar-color:var(--md-default-fg-color--lighter) #0000;scrollbar-width:thin}[data-md-toggle=search]:checked~.md-header .md-search__scrollwrap{max-height:75vh}.md-search__scrollwrap:hover{scrollbar-color:var(--md-accent-fg-color) #0000}.md-search__scrollwrap::-webkit-scrollbar{height:.2rem;width:.2rem}.md-search__scrollwrap::-webkit-scrollbar-thumb{background-color:var(--md-default-fg-color--lighter)}.md-search__scrollwrap::-webkit-scrollbar-thumb:hover{background-color:var(--md-accent-fg-color)}}.md-search-result{color:var(--md-default-fg-color);word-break:break-word}.md-search-result__meta{background-color:var(--md-default-fg-color--lightest);color:var(--md-default-fg-color--light);font-size:.64rem;line-height:1.8rem;padding:0 .8rem;scroll-snap-align:start}@media screen and (min-width:60em){[dir=ltr] .md-search-result__meta{padding-left:2.2rem}[dir=rtl] .md-search-result__meta{padding-right:2.2rem}}.md-search-result__list{list-style:none;margin:0;padding:0;-webkit-user-select:none;user-select:none}.md-search-result__item{box-shadow:0 -.05rem var(--md-default-fg-color--lightest)}.md-search-result__item:first-child{box-shadow:none}.md-search-result__link{display:block;outline:none;scroll-snap-align:start;transition:background-color 
.25s}.md-search-result__link:focus,.md-search-result__link:hover{background-color:var(--md-accent-fg-color--transparent)}.md-search-result__link:last-child p:last-child{margin-bottom:.6rem}.md-search-result__more>summary{cursor:pointer;display:block;outline:none;position:sticky;scroll-snap-align:start;top:0;z-index:1}.md-search-result__more>summary::marker{display:none}.md-search-result__more>summary::-webkit-details-marker{display:none}.md-search-result__more>summary>div{color:var(--md-typeset-a-color);font-size:.64rem;padding:.75em .8rem;transition:color .25s,background-color .25s}@media screen and (min-width:60em){[dir=ltr] .md-search-result__more>summary>div{padding-left:2.2rem}[dir=rtl] .md-search-result__more>summary>div{padding-right:2.2rem}}.md-search-result__more>summary:focus>div,.md-search-result__more>summary:hover>div{background-color:var(--md-accent-fg-color--transparent);color:var(--md-accent-fg-color)}.md-search-result__more[open]>summary{background-color:var(--md-default-bg-color)}.md-search-result__article{overflow:hidden;padding:0 .8rem;position:relative}@media screen and (min-width:60em){[dir=ltr] .md-search-result__article{padding-left:2.2rem}[dir=rtl] .md-search-result__article{padding-right:2.2rem}}[dir=ltr] .md-search-result__icon{left:0}[dir=rtl] .md-search-result__icon{right:0}.md-search-result__icon{color:var(--md-default-fg-color--light);height:1.2rem;margin:.5rem;position:absolute;width:1.2rem}@media screen and (max-width:59.984375em){.md-search-result__icon{display:none}}.md-search-result__icon:after{background-color:currentcolor;content:"";display:inline-block;height:100%;-webkit-mask-image:var(--md-search-result-icon);mask-image:var(--md-search-result-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:100%}[dir=rtl] .md-search-result__icon:after{transform:scaleX(-1)}.md-search-result 
.md-typeset{color:var(--md-default-fg-color--light);font-size:.64rem;line-height:1.6}.md-search-result .md-typeset h1{color:var(--md-default-fg-color);font-size:.8rem;font-weight:400;line-height:1.4;margin:.55rem 0}.md-search-result .md-typeset h1 mark{text-decoration:none}.md-search-result .md-typeset h2{color:var(--md-default-fg-color);font-size:.64rem;font-weight:700;line-height:1.6;margin:.5em 0}.md-search-result .md-typeset h2 mark{text-decoration:none}.md-search-result__terms{color:var(--md-default-fg-color);display:block;font-size:.64rem;font-style:italic;margin:.5em 0}.md-search-result mark{background-color:initial;color:var(--md-accent-fg-color);text-decoration:underline}.md-select{position:relative;z-index:1}.md-select__inner{background-color:var(--md-default-bg-color);border-radius:.1rem;box-shadow:var(--md-shadow-z2);color:var(--md-default-fg-color);left:50%;margin-top:.2rem;max-height:0;opacity:0;position:absolute;top:calc(100% - .2rem);transform:translate3d(-50%,.3rem,0);transition:transform .25s 375ms,opacity .25s .25s,max-height 0ms .5s}.md-select:focus-within .md-select__inner,.md-select:hover .md-select__inner{max-height:10rem;opacity:1;transform:translate3d(-50%,0,0);transition:transform .25s cubic-bezier(.1,.7,.1,1),opacity .25s,max-height 0ms}.md-select__inner:after{border-bottom:.2rem solid #0000;border-bottom-color:var(--md-default-bg-color);border-left:.2rem solid #0000;border-right:.2rem solid #0000;border-top:0;content:"";height:0;left:50%;margin-left:-.2rem;margin-top:-.2rem;position:absolute;top:0;width:0}.md-select__list{border-radius:.1rem;font-size:.8rem;list-style-type:none;margin:0;max-height:inherit;overflow:auto;padding:0}.md-select__item{line-height:1.8rem}[dir=ltr] .md-select__link{padding-left:.6rem;padding-right:1.2rem}[dir=rtl] .md-select__link{padding-left:1.2rem;padding-right:.6rem}.md-select__link{cursor:pointer;display:block;outline:none;scroll-snap-align:start;transition:background-color .25s,color 
.25s;width:100%}.md-select__link:focus,.md-select__link:hover{color:var(--md-accent-fg-color)}.md-select__link:focus{background-color:var(--md-default-fg-color--lightest)}.md-sidebar{align-self:flex-start;flex-shrink:0;padding:1.2rem 0;position:sticky;top:2.4rem;width:12.1rem}@media print{.md-sidebar{display:none}}@media screen and (max-width:76.234375em){[dir=ltr] .md-sidebar--primary{left:-12.1rem}[dir=rtl] .md-sidebar--primary{right:-12.1rem}.md-sidebar--primary{background-color:var(--md-default-bg-color);display:block;height:100%;position:fixed;top:0;transform:translateX(0);transition:transform .25s cubic-bezier(.4,0,.2,1),box-shadow .25s;width:12.1rem;z-index:5}[data-md-toggle=drawer]:checked~.md-container .md-sidebar--primary{box-shadow:var(--md-shadow-z3);transform:translateX(12.1rem)}[dir=rtl] [data-md-toggle=drawer]:checked~.md-container .md-sidebar--primary{transform:translateX(-12.1rem)}.md-sidebar--primary .md-sidebar__scrollwrap{bottom:0;left:0;margin:0;overflow:hidden;position:absolute;right:0;scroll-snap-type:none;top:0}}@media screen and (min-width:76.25em){.md-sidebar{height:0}.no-js .md-sidebar{height:auto}.md-header--lifted~.md-container .md-sidebar{top:4.8rem}}.md-sidebar--secondary{display:none;order:2}@media screen and (min-width:60em){.md-sidebar--secondary{height:0}.no-js .md-sidebar--secondary{height:auto}.md-sidebar--secondary:not([hidden]){display:block}.md-sidebar--secondary .md-sidebar__scrollwrap{touch-action:pan-y}}.md-sidebar__scrollwrap{scrollbar-gutter:stable;-webkit-backface-visibility:hidden;backface-visibility:hidden;margin:0 .2rem;overflow-y:auto;scrollbar-color:var(--md-default-fg-color--lighter) #0000;scrollbar-width:thin}.md-sidebar__scrollwrap::-webkit-scrollbar{height:.2rem;width:.2rem}.md-sidebar__scrollwrap:focus-within,.md-sidebar__scrollwrap:hover{scrollbar-color:var(--md-accent-fg-color) 
#0000}.md-sidebar__scrollwrap:focus-within::-webkit-scrollbar-thumb,.md-sidebar__scrollwrap:hover::-webkit-scrollbar-thumb{background-color:var(--md-default-fg-color--lighter)}.md-sidebar__scrollwrap:focus-within::-webkit-scrollbar-thumb:hover,.md-sidebar__scrollwrap:hover::-webkit-scrollbar-thumb:hover{background-color:var(--md-accent-fg-color)}@supports selector(::-webkit-scrollbar){.md-sidebar__scrollwrap{scrollbar-gutter:auto}[dir=ltr] .md-sidebar__inner{padding-right:calc(100% - 11.5rem)}[dir=rtl] .md-sidebar__inner{padding-left:calc(100% - 11.5rem)}}@media screen and (max-width:76.234375em){.md-overlay{background-color:#0000008a;height:0;opacity:0;position:fixed;top:0;transition:width 0ms .25s,height 0ms .25s,opacity .25s;width:0;z-index:5}[data-md-toggle=drawer]:checked~.md-overlay{height:100%;opacity:1;transition:width 0ms,height 0ms,opacity .25s;width:100%}}@keyframes facts{0%{height:0}to{height:.65rem}}@keyframes fact{0%{opacity:0;transform:translateY(100%)}50%{opacity:0}to{opacity:1;transform:translateY(0)}}:root{--md-source-forks-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path d="M5 5.372v.878c0 .414.336.75.75.75h4.5a.75.75 0 0 0 .75-.75v-.878a2.25 2.25 0 1 1 1.5 0v.878a2.25 2.25 0 0 1-2.25 2.25h-1.5v2.128a2.251 2.251 0 1 1-1.5 0V8.5h-1.5A2.25 2.25 0 0 1 3.5 6.25v-.878a2.25 2.25 0 1 1 1.5 0ZM5 3.25a.75.75 0 1 0-1.5 0 .75.75 0 0 0 1.5 0Zm6.75.75a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5Zm-3 8.75a.75.75 0 1 0-1.5 0 .75.75 0 0 0 1.5 0Z"/></svg>');--md-source-repositories-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path d="M2 2.5A2.5 2.5 0 0 1 4.5 0h8.75a.75.75 0 0 1 .75.75v12.5a.75.75 0 0 1-.75.75h-2.5a.75.75 0 0 1 0-1.5h1.75v-2h-8a1 1 0 0 0-.714 1.7.75.75 0 1 1-1.072 1.05A2.495 2.495 0 0 1 2 11.5Zm10.5-1h-8a1 1 0 0 0-1 1v6.708A2.486 2.486 0 0 1 4.5 9h8ZM5 12.25a.25.25 0 0 1 .25-.25h3.5a.25.25 0 0 1 .25.25v3.25a.25.25 0 0 
1-.4.2l-1.45-1.087a.249.249 0 0 0-.3 0L5.4 15.7a.25.25 0 0 1-.4-.2Z"/></svg>');--md-source-stars-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path d="M8 .25a.75.75 0 0 1 .673.418l1.882 3.815 4.21.612a.75.75 0 0 1 .416 1.279l-3.046 2.97.719 4.192a.751.751 0 0 1-1.088.791L8 12.347l-3.766 1.98a.75.75 0 0 1-1.088-.79l.72-4.194L.818 6.374a.75.75 0 0 1 .416-1.28l4.21-.611L7.327.668A.75.75 0 0 1 8 .25Zm0 2.445L6.615 5.5a.75.75 0 0 1-.564.41l-3.097.45 2.24 2.184a.75.75 0 0 1 .216.664l-.528 3.084 2.769-1.456a.75.75 0 0 1 .698 0l2.77 1.456-.53-3.084a.75.75 0 0 1 .216-.664l2.24-2.183-3.096-.45a.75.75 0 0 1-.564-.41L8 2.694Z"/></svg>');--md-source-version-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path d="M1 7.775V2.75C1 1.784 1.784 1 2.75 1h5.025c.464 0 .91.184 1.238.513l6.25 6.25a1.75 1.75 0 0 1 0 2.474l-5.026 5.026a1.75 1.75 0 0 1-2.474 0l-6.25-6.25A1.752 1.752 0 0 1 1 7.775Zm1.5 0c0 .066.026.13.073.177l6.25 6.25a.25.25 0 0 0 .354 0l5.025-5.025a.25.25 0 0 0 0-.354l-6.25-6.25a.25.25 0 0 0-.177-.073H2.75a.25.25 0 0 0-.25.25ZM6 5a1 1 0 1 1 0 2 1 1 0 0 1 0-2Z"/></svg>')}.md-source{-webkit-backface-visibility:hidden;backface-visibility:hidden;display:block;font-size:.65rem;line-height:1.2;outline-color:var(--md-accent-fg-color);transition:opacity .25s;white-space:nowrap}.md-source:hover{opacity:.7}.md-source__icon{display:inline-block;height:2.4rem;vertical-align:middle;width:2rem}[dir=ltr] .md-source__icon svg{margin-left:.6rem}[dir=rtl] .md-source__icon svg{margin-right:.6rem}.md-source__icon svg{margin-top:.6rem}[dir=ltr] .md-source__icon+.md-source__repository{padding-left:2rem}[dir=rtl] .md-source__icon+.md-source__repository{padding-right:2rem}[dir=ltr] .md-source__icon+.md-source__repository{margin-left:-2rem}[dir=rtl] .md-source__icon+.md-source__repository{margin-right:-2rem}[dir=ltr] .md-source__repository{margin-left:.6rem}[dir=rtl] 
.md-source__repository{margin-right:.6rem}.md-source__repository{display:inline-block;max-width:calc(100% - 1.2rem);overflow:hidden;text-overflow:ellipsis;vertical-align:middle}.md-source__facts{display:flex;font-size:.55rem;gap:.4rem;list-style-type:none;margin:.1rem 0 0;opacity:.75;overflow:hidden;padding:0;width:100%}.md-source__repository--active .md-source__facts{animation:facts .25s ease-in}.md-source__fact{overflow:hidden;text-overflow:ellipsis}.md-source__repository--active .md-source__fact{animation:fact .4s ease-out}[dir=ltr] .md-source__fact:before{margin-right:.1rem}[dir=rtl] .md-source__fact:before{margin-left:.1rem}.md-source__fact:before{background-color:currentcolor;content:"";display:inline-block;height:.6rem;-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;vertical-align:text-top;width:.6rem}.md-source__fact:nth-child(1n+2){flex-shrink:0}.md-source__fact--version:before{-webkit-mask-image:var(--md-source-version-icon);mask-image:var(--md-source-version-icon)}.md-source__fact--stars:before{-webkit-mask-image:var(--md-source-stars-icon);mask-image:var(--md-source-stars-icon)}.md-source__fact--forks:before{-webkit-mask-image:var(--md-source-forks-icon);mask-image:var(--md-source-forks-icon)}.md-source__fact--repositories:before{-webkit-mask-image:var(--md-source-repositories-icon);mask-image:var(--md-source-repositories-icon)}.md-source-file{margin:1em 0}[dir=ltr] .md-source-file__fact{margin-right:.6rem}[dir=rtl] .md-source-file__fact{margin-left:.6rem}.md-source-file__fact{align-items:center;color:var(--md-default-fg-color--light);display:inline-flex;font-size:.68rem;gap:.3rem}.md-source-file__fact .md-icon{flex-shrink:0;margin-bottom:.05rem}[dir=ltr] .md-source-file__fact .md-author{float:left}[dir=rtl] .md-source-file__fact .md-author{float:right}.md-source-file__fact .md-author{margin-right:.2rem}.md-source-file__fact 
svg{width:.9rem}:root{--md-status:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M11 9h2V7h-2m1 13c-4.41 0-8-3.59-8-8s3.59-8 8-8 8 3.59 8 8-3.59 8-8 8m0-18A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10A10 10 0 0 0 12 2m-1 15h2v-6h-2v6Z"/></svg>');--md-status--new:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m23 12-2.44-2.78.34-3.68-3.61-.82-1.89-3.18L12 3 8.6 1.54 6.71 4.72l-3.61.81.34 3.68L1 12l2.44 2.78-.34 3.69 3.61.82 1.89 3.18L12 21l3.4 1.46 1.89-3.18 3.61-.82-.34-3.68L23 12m-10 5h-2v-2h2v2m0-4h-2V7h2v6Z"/></svg>');--md-status--deprecated:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9 3v1H4v2h1v13a2 2 0 0 0 2 2h10a2 2 0 0 0 2-2V6h1V4h-5V3H9m0 5h2v9H9V8m4 0h2v9h-2V8Z"/></svg>');--md-status--encrypted:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 1 3 5v6c0 5.55 3.84 10.74 9 12 5.16-1.26 9-6.45 9-12V5l-9-4m0 6c1.4 0 2.8 1.1 2.8 2.5V11c.6 0 1.2.6 1.2 1.3v3.5c0 .6-.6 1.2-1.3 1.2H9.2c-.6 0-1.2-.6-1.2-1.3v-3.5c0-.6.6-1.2 1.2-1.2V9.5C9.2 8.1 10.6 7 12 7m0 1.2c-.8 0-1.5.5-1.5 
1.3V11h3V9.5c0-.8-.7-1.3-1.5-1.3Z"/></svg>')}.md-status:after{background-color:var(--md-default-fg-color--light);content:"";display:inline-block;height:1.125em;-webkit-mask-image:var(--md-status);mask-image:var(--md-status);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;vertical-align:text-bottom;width:1.125em}.md-status:hover:after{background-color:currentcolor}.md-status--new:after{-webkit-mask-image:var(--md-status--new);mask-image:var(--md-status--new)}.md-status--deprecated:after{-webkit-mask-image:var(--md-status--deprecated);mask-image:var(--md-status--deprecated)}.md-status--encrypted:after{-webkit-mask-image:var(--md-status--encrypted);mask-image:var(--md-status--encrypted)}.md-tabs{background-color:var(--md-primary-fg-color);color:var(--md-primary-bg-color);display:block;line-height:1.3;overflow:auto;width:100%;z-index:3}@media print{.md-tabs{display:none}}@media screen and (max-width:76.234375em){.md-tabs{display:none}}.md-tabs[hidden]{pointer-events:none}[dir=ltr] .md-tabs__list{margin-left:.2rem}[dir=rtl] .md-tabs__list{margin-right:.2rem}.md-tabs__list{contain:content;display:flex;list-style:none;margin:0;overflow:auto;padding:0;scrollbar-width:none;white-space:nowrap}.md-tabs__list::-webkit-scrollbar{display:none}.md-tabs__item{height:2.4rem;padding-left:.6rem;padding-right:.6rem}.md-tabs__item--active .md-tabs__link{color:inherit;opacity:1}.md-tabs__link{-webkit-backface-visibility:hidden;backface-visibility:hidden;display:flex;font-size:.7rem;margin-top:.8rem;opacity:.7;outline-color:var(--md-accent-fg-color);outline-offset:.2rem;transition:transform .4s cubic-bezier(.1,.7,.1,1),opacity .25s}.md-tabs__link:focus,.md-tabs__link:hover{color:inherit;opacity:1}[dir=ltr] .md-tabs__link svg{margin-right:.4rem}[dir=rtl] .md-tabs__link svg{margin-left:.4rem}.md-tabs__link svg{fill:currentcolor;height:1.3em}.md-tabs__item:nth-child(2) 
.md-tabs__link{transition-delay:20ms}.md-tabs__item:nth-child(3) .md-tabs__link{transition-delay:40ms}.md-tabs__item:nth-child(4) .md-tabs__link{transition-delay:60ms}.md-tabs__item:nth-child(5) .md-tabs__link{transition-delay:80ms}.md-tabs__item:nth-child(6) .md-tabs__link{transition-delay:.1s}.md-tabs__item:nth-child(7) .md-tabs__link{transition-delay:.12s}.md-tabs__item:nth-child(8) .md-tabs__link{transition-delay:.14s}.md-tabs__item:nth-child(9) .md-tabs__link{transition-delay:.16s}.md-tabs__item:nth-child(10) .md-tabs__link{transition-delay:.18s}.md-tabs__item:nth-child(11) .md-tabs__link{transition-delay:.2s}.md-tabs__item:nth-child(12) .md-tabs__link{transition-delay:.22s}.md-tabs__item:nth-child(13) .md-tabs__link{transition-delay:.24s}.md-tabs__item:nth-child(14) .md-tabs__link{transition-delay:.26s}.md-tabs__item:nth-child(15) .md-tabs__link{transition-delay:.28s}.md-tabs__item:nth-child(16) .md-tabs__link{transition-delay:.3s}.md-tabs[hidden] .md-tabs__link{opacity:0;transform:translateY(50%);transition:transform 0ms .1s,opacity .1s}:root{--md-tag-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m5.41 21 .71-4h-4l.35-2h4l1.06-6h-4l.35-2h4l.71-4h2l-.71 4h6l.71-4h2l-.71 4h4l-.35 2h-4l-1.06 6h4l-.35 2h-4l-.71 4h-2l.71-4h-6l-.71 4h-2M9.53 9l-1.06 6h6l1.06-6h-6Z"/></svg>')}.md-typeset .md-tags:not([hidden]){display:inline-flex;flex-wrap:wrap;gap:.5em;margin-bottom:.75em;margin-top:-.125em}.md-typeset .md-tag{align-items:center;background:var(--md-default-fg-color--lightest);border-radius:2.4rem;display:inline-flex;font-size:.64rem;font-size:min(.8em,.64rem);font-weight:700;gap:.5em;letter-spacing:normal;line-height:1.6;padding:.3125em .78125em}.md-typeset .md-tag[href]{-webkit-tap-highlight-color:transparent;color:inherit;outline:none;transition:color 125ms,background-color 125ms}.md-typeset .md-tag[href]:focus,.md-typeset 
.md-tag[href]:hover{background-color:var(--md-accent-fg-color);color:var(--md-accent-bg-color)}[id]>.md-typeset .md-tag{vertical-align:text-top}.md-typeset .md-tag-icon:before{background-color:var(--md-default-fg-color--lighter);content:"";display:inline-block;height:1.2em;-webkit-mask-image:var(--md-tag-icon);mask-image:var(--md-tag-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;transition:background-color 125ms;vertical-align:text-bottom;width:1.2em}.md-typeset .md-tag-icon[href]:focus:before,.md-typeset .md-tag-icon[href]:hover:before{background-color:var(--md-accent-bg-color)}@keyframes pulse{0%{transform:scale(.95)}75%{transform:scale(1)}to{transform:scale(.95)}}:root{--md-annotation-bg-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 2A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10A10 10 0 0 0 12 2Z"/></svg>');--md-annotation-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 13h-4v4h-2v-4H7v-2h4V7h2v4h4m-5-9A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10A10 10 0 0 0 12 2Z"/></svg>')}.md-tooltip{-webkit-backface-visibility:hidden;backface-visibility:hidden;background-color:var(--md-default-bg-color);border-radius:.1rem;box-shadow:var(--md-shadow-z2);color:var(--md-default-fg-color);font-family:var(--md-text-font-family);left:clamp(var(--md-tooltip-0,0rem) + .8rem,var(--md-tooltip-x),100vw + var(--md-tooltip-0,0rem) + .8rem - var(--md-tooltip-width) - 2 * .8rem);max-width:calc(100vw - 1.6rem);opacity:0;position:absolute;top:var(--md-tooltip-y);transform:translateY(-.4rem);transition:transform 0ms .25s,opacity .25s,z-index .25s;width:var(--md-tooltip-width);z-index:0}.md-tooltip--active{opacity:1;transform:translateY(0);transition:transform .25s cubic-bezier(.1,.7,.1,1),opacity .25s,z-index 
0ms;z-index:2}.md-tooltip--inline{font-weight:700;-webkit-user-select:none;user-select:none;width:auto}.md-tooltip--inline:not(.md-tooltip--active){transform:translateY(.2rem) scale(.9)}.md-tooltip--inline .md-tooltip__inner{font-size:.5rem;padding:.2rem .4rem}[hidden]+.md-tooltip--inline{display:none}.focus-visible>.md-tooltip,.md-tooltip:target{outline:var(--md-accent-fg-color) auto}.md-tooltip__inner{font-size:.64rem;padding:.8rem}.md-tooltip__inner.md-typeset>:first-child{margin-top:0}.md-tooltip__inner.md-typeset>:last-child{margin-bottom:0}.md-annotation{font-style:normal;font-weight:400;outline:none;text-align:initial;vertical-align:text-bottom;white-space:normal}[dir=rtl] .md-annotation{direction:rtl}code .md-annotation{font-family:var(--md-code-font-family);font-size:inherit}.md-annotation:not([hidden]){display:inline-block;line-height:1.25}.md-annotation__index{border-radius:.01px;cursor:pointer;display:inline-block;margin-left:.4ch;margin-right:.4ch;outline:none;overflow:hidden;position:relative;-webkit-user-select:none;user-select:none;vertical-align:text-top;z-index:0}.md-annotation .md-annotation__index{transition:z-index .25s}@media screen{.md-annotation__index{width:2.2ch}[data-md-visible]>.md-annotation__index{animation:pulse 2s infinite}.md-annotation__index:before{background:var(--md-default-bg-color);-webkit-mask-image:var(--md-annotation-bg-icon);mask-image:var(--md-annotation-bg-icon)}.md-annotation__index:after,.md-annotation__index:before{content:"";height:2.2ch;-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;position:absolute;top:-.1ch;width:2.2ch;z-index:-1}.md-annotation__index:after{background-color:var(--md-default-fg-color--lighter);-webkit-mask-image:var(--md-annotation-icon);mask-image:var(--md-annotation-icon);transform:scale(1.0001);transition:background-color .25s,transform 
.25s}.md-tooltip--active+.md-annotation__index:after{transform:rotate(45deg)}.md-tooltip--active+.md-annotation__index:after,:hover>.md-annotation__index:after{background-color:var(--md-accent-fg-color)}}.md-tooltip--active+.md-annotation__index{animation-play-state:paused;transition-duration:0ms;z-index:2}.md-annotation__index [data-md-annotation-id]{display:inline-block}@media print{.md-annotation__index [data-md-annotation-id]{background:var(--md-default-fg-color--lighter);border-radius:2ch;color:var(--md-default-bg-color);font-weight:700;padding:0 .6ch;white-space:nowrap}.md-annotation__index [data-md-annotation-id]:after{content:attr(data-md-annotation-id)}}.md-typeset .md-annotation-list{counter-reset:xxx;list-style:none}.md-typeset .md-annotation-list li{position:relative}[dir=ltr] .md-typeset .md-annotation-list li:before{left:-2.125em}[dir=rtl] .md-typeset .md-annotation-list li:before{right:-2.125em}.md-typeset .md-annotation-list li:before{background:var(--md-default-fg-color--lighter);border-radius:2ch;color:var(--md-default-bg-color);content:counter(xxx);counter-increment:xxx;font-size:.8875em;font-weight:700;height:2ch;line-height:1.25;min-width:2ch;padding:0 .6ch;position:absolute;text-align:center;top:.25em}:root{--md-tooltip-width:20rem;--md-tooltip-tail:0.3rem}.md-tooltip2{-webkit-backface-visibility:hidden;backface-visibility:hidden;color:var(--md-default-fg-color);font-family:var(--md-text-font-family);opacity:0;pointer-events:none;position:absolute;top:calc(var(--md-tooltip-host-y) + var(--md-tooltip-y));transform:translateY(-.4rem);transform-origin:calc(var(--md-tooltip-host-x) + var(--md-tooltip-x)) 0;transition:transform 0ms .25s,opacity .25s,z-index .25s;width:100%;z-index:0}.md-tooltip2:before{border-left:var(--md-tooltip-tail) solid #0000;border-right:var(--md-tooltip-tail) solid #0000;content:"";display:block;left:clamp(1.5 * .8rem,var(--md-tooltip-host-x) + var(--md-tooltip-x) - var(--md-tooltip-tail),100vw - 2 * var(--md-tooltip-tail) 
- 1.5 * .8rem);position:absolute;z-index:1}.md-tooltip2--top:before{border-top:var(--md-tooltip-tail) solid var(--md-default-bg-color);bottom:calc(var(--md-tooltip-tail)*-1 + .025rem);filter:drop-shadow(0 1px 0 hsla(0,0%,0%,.05))}.md-tooltip2--bottom:before{border-bottom:var(--md-tooltip-tail) solid var(--md-default-bg-color);filter:drop-shadow(0 -1px 0 hsla(0,0%,0%,.05));top:calc(var(--md-tooltip-tail)*-1 + .025rem)}.md-tooltip2--active{opacity:1;transform:translateY(0);transition:transform .4s cubic-bezier(0,1,.5,1),opacity .25s,z-index 0ms;z-index:2}.md-tooltip2__inner{scrollbar-gutter:stable;background-color:var(--md-default-bg-color);border-radius:.1rem;box-shadow:var(--md-shadow-z2);left:clamp(.8rem,var(--md-tooltip-host-x) - .8rem,100vw - var(--md-tooltip-width) - .8rem);max-height:40vh;max-width:calc(100vw - 1.6rem);position:relative;scrollbar-width:thin}.md-tooltip2__inner::-webkit-scrollbar{height:.2rem;width:.2rem}.md-tooltip2__inner::-webkit-scrollbar-thumb{background-color:var(--md-default-fg-color--lighter)}.md-tooltip2__inner::-webkit-scrollbar-thumb:hover{background-color:var(--md-accent-fg-color)}[role=tooltip]>.md-tooltip2__inner{font-size:.5rem;font-weight:700;left:clamp(.8rem,var(--md-tooltip-host-x) + var(--md-tooltip-x) - var(--md-tooltip-width)/2,100vw - var(--md-tooltip-width) - .8rem);max-width:min(100vw - 2 * .8rem,400px);padding:.2rem .4rem;-webkit-user-select:none;user-select:none;width:-moz-fit-content;width:fit-content}.md-tooltip2__inner.md-typeset>:first-child{margin-top:0}.md-tooltip2__inner.md-typeset>:last-child{margin-bottom:0}[dir=ltr] .md-top{margin-left:50%}[dir=rtl] .md-top{margin-right:50%}.md-top{background-color:var(--md-default-bg-color);border-radius:1.6rem;box-shadow:var(--md-shadow-z2);color:var(--md-default-fg-color--light);cursor:pointer;display:block;font-size:.7rem;outline:none;padding:.4rem .8rem;position:fixed;top:3.2rem;transform:translate(-50%);transition:color 125ms,background-color 125ms,transform 125ms 
cubic-bezier(.4,0,.2,1),opacity 125ms;z-index:2}@media print{.md-top{display:none}}[dir=rtl] .md-top{transform:translate(50%)}.md-top[hidden]{opacity:0;pointer-events:none;transform:translate(-50%,.2rem);transition-duration:0ms}[dir=rtl] .md-top[hidden]{transform:translate(50%,.2rem)}.md-top:focus,.md-top:hover{background-color:var(--md-accent-fg-color);color:var(--md-accent-bg-color)}.md-top svg{display:inline-block;vertical-align:-.5em}@keyframes hoverfix{0%{pointer-events:none}}:root{--md-version-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M137.4 374.6c12.5 12.5 32.8 12.5 45.3 0l128-128c9.2-9.2 11.9-22.9 6.9-34.9S301 191.9 288 191.9L32 192c-12.9 0-24.6 7.8-29.6 19.8s-2.2 25.7 6.9 34.9l128 128z"/></svg>')}.md-version{flex-shrink:0;font-size:.8rem;height:2.4rem}[dir=ltr] .md-version__current{margin-left:1.4rem;margin-right:.4rem}[dir=rtl] .md-version__current{margin-left:.4rem;margin-right:1.4rem}.md-version__current{color:inherit;cursor:pointer;outline:none;position:relative;top:.05rem}[dir=ltr] .md-version__current:after{margin-left:.4rem}[dir=rtl] .md-version__current:after{margin-right:.4rem}.md-version__current:after{background-color:currentcolor;content:"";display:inline-block;height:.6rem;-webkit-mask-image:var(--md-version-icon);mask-image:var(--md-version-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:.4rem}.md-version__alias{margin-left:.3rem;opacity:.7}.md-version__list{background-color:var(--md-default-bg-color);border-radius:.1rem;box-shadow:var(--md-shadow-z2);color:var(--md-default-fg-color);list-style-type:none;margin:.2rem 
.8rem;max-height:0;opacity:0;overflow:auto;padding:0;position:absolute;scroll-snap-type:y mandatory;top:.15rem;transition:max-height 0ms .5s,opacity .25s .25s;z-index:3}.md-version:focus-within .md-version__list,.md-version:hover .md-version__list{max-height:10rem;opacity:1;transition:max-height 0ms,opacity .25s}@media (hover:none),(pointer:coarse){.md-version:hover .md-version__list{animation:hoverfix .25s forwards}.md-version:focus-within .md-version__list{animation:none}}.md-version__item{line-height:1.8rem}[dir=ltr] .md-version__link{padding-left:.6rem;padding-right:1.2rem}[dir=rtl] .md-version__link{padding-left:1.2rem;padding-right:.6rem}.md-version__link{cursor:pointer;display:block;outline:none;scroll-snap-align:start;transition:color .25s,background-color .25s;white-space:nowrap;width:100%}.md-version__link:focus,.md-version__link:hover{color:var(--md-accent-fg-color)}.md-version__link:focus{background-color:var(--md-default-fg-color--lightest)}:root{--md-admonition-icon--note:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 2C6.47 2 2 6.47 2 12s4.47 10 10 10 10-4.47 10-10S17.53 2 12 2m3.1 5.07c.14 0 .28.05.4.16l1.27 1.27c.23.22.23.57 0 .78l-1 1-2.05-2.05 1-1c.1-.11.24-.16.38-.16m-1.97 1.74 2.06 2.06-6.06 6.06H7.07v-2.06l6.06-6.06Z"/></svg>');--md-admonition-icon--abstract:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 9H7V7h10m0 6H7v-2h10m-3 6H7v-2h7M12 3a1 1 0 0 1 1 1 1 1 0 0 1-1 1 1 1 0 0 1-1-1 1 1 0 0 1 1-1m7 0h-4.18C14.4 1.84 13.3 1 12 1c-1.3 0-2.4.84-2.82 2H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V5a2 2 0 0 0-2-2Z"/></svg>');--md-admonition-icon--info:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 9h-2V7h2m0 10h-2v-6h2m-1-9A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10A10 10 0 0 0 12 
2Z"/></svg>');--md-admonition-icon--tip:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17.66 11.2c-.23-.3-.51-.56-.77-.82-.67-.6-1.43-1.03-2.07-1.66C13.33 7.26 13 4.85 13.95 3c-.95.23-1.78.75-2.49 1.32-2.59 2.08-3.61 5.75-2.39 8.9.04.1.08.2.08.33 0 .22-.15.42-.35.5-.23.1-.47.04-.66-.12a.58.58 0 0 1-.14-.17c-1.13-1.43-1.31-3.48-.55-5.12C5.78 10 4.87 12.3 5 14.47c.06.5.12 1 .29 1.5.14.6.41 1.2.71 1.73 1.08 1.73 2.95 2.97 4.96 3.22 2.14.27 4.43-.12 6.07-1.6 1.83-1.66 2.47-4.32 1.53-6.6l-.13-.26c-.21-.46-.77-1.26-.77-1.26m-3.16 6.3c-.28.24-.74.5-1.1.6-1.12.4-2.24-.16-2.9-.82 1.19-.28 1.9-1.16 2.11-2.05.17-.8-.15-1.46-.28-2.23-.12-.74-.1-1.37.17-2.06.19.38.39.76.63 1.06.77 1 1.98 1.44 2.24 2.8.04.14.06.28.06.43.03.82-.33 1.72-.93 2.27Z"/></svg>');--md-admonition-icon--success:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 7 9 19l-5.5-5.5 1.41-1.41L9 16.17 19.59 5.59 21 7Z"/></svg>');--md-admonition-icon--question:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m15.07 11.25-.9.92C13.45 12.89 13 13.5 13 15h-2v-.5c0-1.11.45-2.11 1.17-2.83l1.24-1.26c.37-.36.59-.86.59-1.41a2 2 0 0 0-2-2 2 2 0 0 0-2 2H8a4 4 0 0 1 4-4 4 4 0 0 1 4 4 3.2 3.2 0 0 1-.93 2.25M13 19h-2v-2h2M12 2A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10c0-5.53-4.5-10-10-10Z"/></svg>');--md-admonition-icon--warning:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 14h-2V9h2m0 9h-2v-2h2M1 21h22L12 2 1 21Z"/></svg>');--md-admonition-icon--failure:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>');--md-admonition-icon--danger:url('data:image/svg+xml;charset=utf-8,<svg 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m11.5 20 4.86-9.73H13V4l-5 9.73h3.5V20M12 2c2.75 0 5.1 1 7.05 2.95C21 6.9 22 9.25 22 12s-1 5.1-2.95 7.05C17.1 21 14.75 22 12 22s-5.1-1-7.05-2.95C3 17.1 2 14.75 2 12s1-5.1 2.95-7.05C6.9 3 9.25 2 12 2Z"/></svg>');--md-admonition-icon--bug:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M11 13h2v1h-2v-1m10-8v6c0 5.5-3.8 10.7-9 12-5.2-1.3-9-6.5-9-12V5l9-4 9 4m-4 5h-2.2c-.2-.6-.6-1.1-1.1-1.5l1.2-1.2-.7-.7L12.8 8H12c-.2 0-.5 0-.7.1L9.9 6.6l-.8.8 1.2 1.2c-.5.3-.9.8-1.1 1.4H7v1h2v1H7v1h2v1H7v1h2.2c.4 1.2 1.5 2 2.8 2s2.4-.8 2.8-2H17v-1h-2v-1h2v-1h-2v-1h2v-1m-6 2h2v-1h-2v1Z"/></svg>');--md-admonition-icon--example:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M7 2v2h1v14a4 4 0 0 0 4 4 4 4 0 0 0 4-4V4h1V2H7m4 14c-.6 0-1-.4-1-1s.4-1 1-1 1 .4 1 1-.4 1-1 1m2-4c-.6 0-1-.4-1-1s.4-1 1-1 1 .4 1 1-.4 1-1 1m1-5h-4V4h4v3Z"/></svg>');--md-admonition-icon--quote:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M14 17h3l2-4V7h-6v6h3M6 17h3l2-4V7H5v6h3l-2 4Z"/></svg>')}.md-typeset .admonition,.md-typeset details{background-color:var(--md-admonition-bg-color);border:.075rem solid #448aff;border-radius:.2rem;box-shadow:var(--md-shadow-z1);color:var(--md-admonition-fg-color);display:flow-root;font-size:.64rem;margin:1.5625em 0;padding:0 .6rem;page-break-inside:avoid;transition:box-shadow 125ms}@media print{.md-typeset .admonition,.md-typeset details{box-shadow:none}}.md-typeset .admonition:focus-within,.md-typeset details:focus-within{box-shadow:0 0 0 .2rem #448aff1a}.md-typeset .admonition>*,.md-typeset details>*{box-sizing:border-box}.md-typeset .admonition .admonition,.md-typeset .admonition details,.md-typeset details .admonition,.md-typeset details details{margin-bottom:1em;margin-top:1em}.md-typeset .admonition .md-typeset__scrollwrap,.md-typeset 
details .md-typeset__scrollwrap{margin:1em -.6rem}.md-typeset .admonition .md-typeset__table,.md-typeset details .md-typeset__table{padding:0 .6rem}.md-typeset .admonition>.tabbed-set:only-child,.md-typeset details>.tabbed-set:only-child{margin-top:0}html .md-typeset .admonition>:last-child,html .md-typeset details>:last-child{margin-bottom:.6rem}[dir=ltr] .md-typeset .admonition-title,[dir=ltr] .md-typeset summary{padding-left:2rem;padding-right:.6rem}[dir=rtl] .md-typeset .admonition-title,[dir=rtl] .md-typeset summary{padding-left:.6rem;padding-right:2rem}[dir=ltr] .md-typeset .admonition-title,[dir=ltr] .md-typeset summary{border-left-width:.2rem}[dir=rtl] .md-typeset .admonition-title,[dir=rtl] .md-typeset summary{border-right-width:.2rem}[dir=ltr] .md-typeset .admonition-title,[dir=ltr] .md-typeset summary{border-top-left-radius:.1rem}[dir=ltr] .md-typeset .admonition-title,[dir=ltr] .md-typeset summary,[dir=rtl] .md-typeset .admonition-title,[dir=rtl] .md-typeset summary{border-top-right-radius:.1rem}[dir=rtl] .md-typeset .admonition-title,[dir=rtl] .md-typeset summary{border-top-left-radius:.1rem}.md-typeset .admonition-title,.md-typeset summary{background-color:#448aff1a;border:none;font-weight:700;margin:0 -.6rem;padding-bottom:.4rem;padding-top:.4rem;position:relative}html .md-typeset .admonition-title:last-child,html .md-typeset summary:last-child{margin-bottom:0}[dir=ltr] .md-typeset .admonition-title:before,[dir=ltr] .md-typeset summary:before{left:.6rem}[dir=rtl] .md-typeset .admonition-title:before,[dir=rtl] .md-typeset summary:before{right:.6rem}.md-typeset .admonition-title:before,.md-typeset summary:before{background-color:#448aff;content:"";height:1rem;-webkit-mask-image:var(--md-admonition-icon--note);mask-image:var(--md-admonition-icon--note);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;position:absolute;top:.625em;width:1rem}.md-typeset 
.admonition-title code,.md-typeset summary code{box-shadow:0 0 0 .05rem var(--md-default-fg-color--lightest)}.md-typeset .admonition.note,.md-typeset details.note{border-color:#448aff}.md-typeset .admonition.note:focus-within,.md-typeset details.note:focus-within{box-shadow:0 0 0 .2rem #448aff1a}.md-typeset .note>.admonition-title,.md-typeset .note>summary{background-color:#448aff1a}.md-typeset .note>.admonition-title:before,.md-typeset .note>summary:before{background-color:#448aff;-webkit-mask-image:var(--md-admonition-icon--note);mask-image:var(--md-admonition-icon--note)}.md-typeset .note>.admonition-title:after,.md-typeset .note>summary:after{color:#448aff}.md-typeset .admonition.abstract,.md-typeset details.abstract{border-color:#00b0ff}.md-typeset .admonition.abstract:focus-within,.md-typeset details.abstract:focus-within{box-shadow:0 0 0 .2rem #00b0ff1a}.md-typeset .abstract>.admonition-title,.md-typeset .abstract>summary{background-color:#00b0ff1a}.md-typeset .abstract>.admonition-title:before,.md-typeset .abstract>summary:before{background-color:#00b0ff;-webkit-mask-image:var(--md-admonition-icon--abstract);mask-image:var(--md-admonition-icon--abstract)}.md-typeset .abstract>.admonition-title:after,.md-typeset .abstract>summary:after{color:#00b0ff}.md-typeset .admonition.info,.md-typeset details.info{border-color:#00b8d4}.md-typeset .admonition.info:focus-within,.md-typeset details.info:focus-within{box-shadow:0 0 0 .2rem #00b8d41a}.md-typeset .info>.admonition-title,.md-typeset .info>summary{background-color:#00b8d41a}.md-typeset .info>.admonition-title:before,.md-typeset .info>summary:before{background-color:#00b8d4;-webkit-mask-image:var(--md-admonition-icon--info);mask-image:var(--md-admonition-icon--info)}.md-typeset .info>.admonition-title:after,.md-typeset .info>summary:after{color:#00b8d4}.md-typeset .admonition.tip,.md-typeset details.tip{border-color:#00bfa5}.md-typeset .admonition.tip:focus-within,.md-typeset 
details.tip:focus-within{box-shadow:0 0 0 .2rem #00bfa51a}.md-typeset .tip>.admonition-title,.md-typeset .tip>summary{background-color:#00bfa51a}.md-typeset .tip>.admonition-title:before,.md-typeset .tip>summary:before{background-color:#00bfa5;-webkit-mask-image:var(--md-admonition-icon--tip);mask-image:var(--md-admonition-icon--tip)}.md-typeset .tip>.admonition-title:after,.md-typeset .tip>summary:after{color:#00bfa5}.md-typeset .admonition.success,.md-typeset details.success{border-color:#00c853}.md-typeset .admonition.success:focus-within,.md-typeset details.success:focus-within{box-shadow:0 0 0 .2rem #00c8531a}.md-typeset .success>.admonition-title,.md-typeset .success>summary{background-color:#00c8531a}.md-typeset .success>.admonition-title:before,.md-typeset .success>summary:before{background-color:#00c853;-webkit-mask-image:var(--md-admonition-icon--success);mask-image:var(--md-admonition-icon--success)}.md-typeset .success>.admonition-title:after,.md-typeset .success>summary:after{color:#00c853}.md-typeset .admonition.question,.md-typeset details.question{border-color:#64dd17}.md-typeset .admonition.question:focus-within,.md-typeset details.question:focus-within{box-shadow:0 0 0 .2rem #64dd171a}.md-typeset .question>.admonition-title,.md-typeset .question>summary{background-color:#64dd171a}.md-typeset .question>.admonition-title:before,.md-typeset .question>summary:before{background-color:#64dd17;-webkit-mask-image:var(--md-admonition-icon--question);mask-image:var(--md-admonition-icon--question)}.md-typeset .question>.admonition-title:after,.md-typeset .question>summary:after{color:#64dd17}.md-typeset .admonition.warning,.md-typeset details.warning{border-color:#ff9100}.md-typeset .admonition.warning:focus-within,.md-typeset details.warning:focus-within{box-shadow:0 0 0 .2rem #ff91001a}.md-typeset .warning>.admonition-title,.md-typeset .warning>summary{background-color:#ff91001a}.md-typeset .warning>.admonition-title:before,.md-typeset 
.warning>summary:before{background-color:#ff9100;-webkit-mask-image:var(--md-admonition-icon--warning);mask-image:var(--md-admonition-icon--warning)}.md-typeset .warning>.admonition-title:after,.md-typeset .warning>summary:after{color:#ff9100}.md-typeset .admonition.failure,.md-typeset details.failure{border-color:#ff5252}.md-typeset .admonition.failure:focus-within,.md-typeset details.failure:focus-within{box-shadow:0 0 0 .2rem #ff52521a}.md-typeset .failure>.admonition-title,.md-typeset .failure>summary{background-color:#ff52521a}.md-typeset .failure>.admonition-title:before,.md-typeset .failure>summary:before{background-color:#ff5252;-webkit-mask-image:var(--md-admonition-icon--failure);mask-image:var(--md-admonition-icon--failure)}.md-typeset .failure>.admonition-title:after,.md-typeset .failure>summary:after{color:#ff5252}.md-typeset .admonition.danger,.md-typeset details.danger{border-color:#ff1744}.md-typeset .admonition.danger:focus-within,.md-typeset details.danger:focus-within{box-shadow:0 0 0 .2rem #ff17441a}.md-typeset .danger>.admonition-title,.md-typeset .danger>summary{background-color:#ff17441a}.md-typeset .danger>.admonition-title:before,.md-typeset .danger>summary:before{background-color:#ff1744;-webkit-mask-image:var(--md-admonition-icon--danger);mask-image:var(--md-admonition-icon--danger)}.md-typeset .danger>.admonition-title:after,.md-typeset .danger>summary:after{color:#ff1744}.md-typeset .admonition.bug,.md-typeset details.bug{border-color:#f50057}.md-typeset .admonition.bug:focus-within,.md-typeset details.bug:focus-within{box-shadow:0 0 0 .2rem #f500571a}.md-typeset .bug>.admonition-title,.md-typeset .bug>summary{background-color:#f500571a}.md-typeset .bug>.admonition-title:before,.md-typeset .bug>summary:before{background-color:#f50057;-webkit-mask-image:var(--md-admonition-icon--bug);mask-image:var(--md-admonition-icon--bug)}.md-typeset .bug>.admonition-title:after,.md-typeset .bug>summary:after{color:#f50057}.md-typeset 
.admonition.example,.md-typeset details.example{border-color:#7c4dff}.md-typeset .admonition.example:focus-within,.md-typeset details.example:focus-within{box-shadow:0 0 0 .2rem #7c4dff1a}.md-typeset .example>.admonition-title,.md-typeset .example>summary{background-color:#7c4dff1a}.md-typeset .example>.admonition-title:before,.md-typeset .example>summary:before{background-color:#7c4dff;-webkit-mask-image:var(--md-admonition-icon--example);mask-image:var(--md-admonition-icon--example)}.md-typeset .example>.admonition-title:after,.md-typeset .example>summary:after{color:#7c4dff}.md-typeset .admonition.quote,.md-typeset details.quote{border-color:#9e9e9e}.md-typeset .admonition.quote:focus-within,.md-typeset details.quote:focus-within{box-shadow:0 0 0 .2rem #9e9e9e1a}.md-typeset .quote>.admonition-title,.md-typeset .quote>summary{background-color:#9e9e9e1a}.md-typeset .quote>.admonition-title:before,.md-typeset .quote>summary:before{background-color:#9e9e9e;-webkit-mask-image:var(--md-admonition-icon--quote);mask-image:var(--md-admonition-icon--quote)}.md-typeset .quote>.admonition-title:after,.md-typeset .quote>summary:after{color:#9e9e9e}:root{--md-footnotes-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 7v4H5.83l3.58-3.59L8 6l-6 6 6 6 1.41-1.42L5.83 13H21V7h-2Z"/></svg>')}.md-typeset .footnote{color:var(--md-default-fg-color--light);font-size:.64rem}[dir=ltr] .md-typeset .footnote>ol{margin-left:0}[dir=rtl] .md-typeset .footnote>ol{margin-right:0}.md-typeset .footnote>ol>li{transition:color 125ms}.md-typeset .footnote>ol>li:target{color:var(--md-default-fg-color)}.md-typeset .footnote>ol>li:focus-within .footnote-backref{opacity:1;transform:translateX(0);transition:none}.md-typeset .footnote>ol>li:hover .footnote-backref,.md-typeset .footnote>ol>li:target .footnote-backref{opacity:1;transform:translateX(0)}.md-typeset .footnote>ol>li>:first-child{margin-top:0}.md-typeset 
.footnote-ref{font-size:.75em;font-weight:700}html .md-typeset .footnote-ref{outline-offset:.1rem}.md-typeset [id^="fnref:"]:target>.footnote-ref{outline:auto}.md-typeset .footnote-backref{color:var(--md-typeset-a-color);display:inline-block;font-size:0;opacity:0;transform:translateX(.25rem);transition:color .25s,transform .25s .25s,opacity 125ms .25s;vertical-align:text-bottom}@media print{.md-typeset .footnote-backref{color:var(--md-typeset-a-color);opacity:1;transform:translateX(0)}}[dir=rtl] .md-typeset .footnote-backref{transform:translateX(-.25rem)}.md-typeset .footnote-backref:hover{color:var(--md-accent-fg-color)}.md-typeset .footnote-backref:before{background-color:currentcolor;content:"";display:inline-block;height:.8rem;-webkit-mask-image:var(--md-footnotes-icon);mask-image:var(--md-footnotes-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;width:.8rem}[dir=rtl] .md-typeset .footnote-backref:before svg{transform:scaleX(-1)}[dir=ltr] .md-typeset .headerlink{margin-left:.5rem}[dir=rtl] .md-typeset .headerlink{margin-right:.5rem}.md-typeset .headerlink{color:var(--md-default-fg-color--lighter);display:inline-block;opacity:0;transition:color .25s,opacity 125ms}@media print{.md-typeset .headerlink{display:none}}.md-typeset .headerlink:focus,.md-typeset :hover>.headerlink,.md-typeset :target>.headerlink{opacity:1;transition:color .25s,opacity 125ms}.md-typeset .headerlink:focus,.md-typeset .headerlink:hover,.md-typeset :target>.headerlink{color:var(--md-accent-fg-color)}.md-typeset :target{--md-scroll-margin:3.6rem;--md-scroll-offset:0rem;scroll-margin-top:calc(var(--md-scroll-margin) - var(--md-scroll-offset))}@media screen and (min-width:76.25em){.md-header--lifted~.md-container .md-typeset :target{--md-scroll-margin:6rem}}.md-typeset h1:target,.md-typeset h2:target,.md-typeset h3:target{--md-scroll-offset:0.2rem}.md-typeset 
h4:target{--md-scroll-offset:0.15rem}.md-typeset div.arithmatex{overflow:auto}@media screen and (max-width:44.984375em){.md-typeset div.arithmatex{margin:0 -.8rem}.md-typeset div.arithmatex>*{width:min-content}}.md-typeset div.arithmatex>*{margin-left:auto!important;margin-right:auto!important;padding:0 .8rem;touch-action:auto}.md-typeset div.arithmatex>* mjx-container{margin:0!important}.md-typeset div.arithmatex mjx-assistive-mml{height:0}.md-typeset del.critic{background-color:var(--md-typeset-del-color)}.md-typeset del.critic,.md-typeset ins.critic{-webkit-box-decoration-break:clone;box-decoration-break:clone}.md-typeset ins.critic{background-color:var(--md-typeset-ins-color)}.md-typeset .critic.comment{-webkit-box-decoration-break:clone;box-decoration-break:clone;color:var(--md-code-hl-comment-color)}.md-typeset .critic.comment:before{content:"/* "}.md-typeset .critic.comment:after{content:" */"}.md-typeset .critic.block{box-shadow:none;display:block;margin:1em 0;overflow:auto;padding-left:.8rem;padding-right:.8rem}.md-typeset .critic.block>:first-child{margin-top:.5em}.md-typeset .critic.block>:last-child{margin-bottom:.5em}:root{--md-details-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M8.59 16.58 13.17 12 8.59 7.41 10 6l6 6-6 6-1.41-1.42Z"/></svg>')}.md-typeset details{display:flow-root;overflow:visible;padding-top:0}.md-typeset details[open]>summary:after{transform:rotate(90deg)}.md-typeset details:not([open]){box-shadow:none;padding-bottom:0}.md-typeset details:not([open])>summary{border-radius:.1rem}[dir=ltr] .md-typeset summary{padding-right:1.8rem}[dir=rtl] .md-typeset summary{padding-left:1.8rem}[dir=ltr] .md-typeset summary{border-top-left-radius:.1rem}[dir=ltr] .md-typeset summary,[dir=rtl] .md-typeset summary{border-top-right-radius:.1rem}[dir=rtl] .md-typeset summary{border-top-left-radius:.1rem}.md-typeset 
summary{cursor:pointer;display:block;min-height:1rem;overflow:hidden}.md-typeset summary.focus-visible{outline-color:var(--md-accent-fg-color);outline-offset:.2rem}.md-typeset summary:not(.focus-visible){-webkit-tap-highlight-color:transparent;outline:none}[dir=ltr] .md-typeset summary:after{right:.4rem}[dir=rtl] .md-typeset summary:after{left:.4rem}.md-typeset summary:after{background-color:currentcolor;content:"";height:1rem;-webkit-mask-image:var(--md-details-icon);mask-image:var(--md-details-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;position:absolute;top:.625em;transform:rotate(0deg);transition:transform .25s;width:1rem}[dir=rtl] .md-typeset summary:after{transform:rotate(180deg)}.md-typeset summary::marker{display:none}.md-typeset summary::-webkit-details-marker{display:none}.md-typeset .emojione,.md-typeset .gemoji,.md-typeset .twemoji{--md-icon-size:1.125em;display:inline-flex;height:var(--md-icon-size);vertical-align:text-top}.md-typeset .emojione svg,.md-typeset .gemoji svg,.md-typeset .twemoji svg{fill:currentcolor;max-height:100%;width:var(--md-icon-size)}.md-typeset .lg,.md-typeset .xl,.md-typeset .xxl,.md-typeset .xxxl{vertical-align:text-bottom}.md-typeset .middle{vertical-align:middle}.md-typeset .lg{--md-icon-size:1.5em}.md-typeset .xl{--md-icon-size:2.25em}.md-typeset .xxl{--md-icon-size:3em}.md-typeset .xxxl{--md-icon-size:4em}.highlight .o,.highlight .ow{color:var(--md-code-hl-operator-color)}.highlight .p{color:var(--md-code-hl-punctuation-color)}.highlight .cpf,.highlight .l,.highlight .s,.highlight .s1,.highlight .s2,.highlight .sb,.highlight .sc,.highlight .si,.highlight .ss{color:var(--md-code-hl-string-color)}.highlight .cp,.highlight .se,.highlight .sh,.highlight .sr,.highlight .sx{color:var(--md-code-hl-special-color)}.highlight .il,.highlight .m,.highlight .mb,.highlight .mf,.highlight .mh,.highlight .mi,.highlight 
.mo{color:var(--md-code-hl-number-color)}.highlight .k,.highlight .kd,.highlight .kn,.highlight .kp,.highlight .kr,.highlight .kt{color:var(--md-code-hl-keyword-color)}.highlight .kc,.highlight .n{color:var(--md-code-hl-name-color)}.highlight .bp,.highlight .nb,.highlight .no{color:var(--md-code-hl-constant-color)}.highlight .nc,.highlight .ne,.highlight .nf,.highlight .nn{color:var(--md-code-hl-function-color)}.highlight .nd,.highlight .ni,.highlight .nl,.highlight .nt{color:var(--md-code-hl-keyword-color)}.highlight .c,.highlight .c1,.highlight .ch,.highlight .cm,.highlight .cs,.highlight .sd{color:var(--md-code-hl-comment-color)}.highlight .na,.highlight .nv,.highlight .vc,.highlight .vg,.highlight .vi{color:var(--md-code-hl-variable-color)}.highlight .ge,.highlight .gh,.highlight .go,.highlight .gp,.highlight .gr,.highlight .gs,.highlight .gt,.highlight .gu{color:var(--md-code-hl-generic-color)}.highlight .gd,.highlight .gi{border-radius:.1rem;margin:0 -.125em;padding:0 .125em}.highlight .gd{background-color:var(--md-typeset-del-color)}.highlight .gi{background-color:var(--md-typeset-ins-color)}.highlight .hll{background-color:var(--md-code-hl-color--light);box-shadow:2px 0 0 0 var(--md-code-hl-color) inset;display:block;margin:0 -1.1764705882em;padding:0 1.1764705882em}.highlight span.filename{background-color:var(--md-code-bg-color);border-bottom:.05rem solid var(--md-default-fg-color--lightest);border-top-left-radius:.1rem;border-top-right-radius:.1rem;display:flow-root;font-size:.85em;font-weight:700;margin-top:1em;padding:.6617647059em 1.1764705882em;position:relative}.highlight span.filename+pre{margin-top:0}.highlight span.filename+pre>code{border-top-left-radius:0;border-top-right-radius:0}.highlight [data-linenos]:before{background-color:var(--md-code-bg-color);box-shadow:-.05rem 0 var(--md-default-fg-color--lightest) 
inset;color:var(--md-default-fg-color--light);content:attr(data-linenos);float:left;left:-1.1764705882em;margin-left:-1.1764705882em;margin-right:1.1764705882em;padding-left:1.1764705882em;position:sticky;-webkit-user-select:none;user-select:none;z-index:3}.highlight code a[id]{position:absolute;visibility:hidden}.highlight code[data-md-copying]{display:initial}.highlight code[data-md-copying] .hll{display:contents}.highlight code[data-md-copying] .md-annotation{display:none}.highlighttable{display:flow-root}.highlighttable tbody,.highlighttable td{display:block;padding:0}.highlighttable tr{display:flex}.highlighttable pre{margin:0}.highlighttable th.filename{flex-grow:1;padding:0;text-align:left}.highlighttable th.filename span.filename{margin-top:0}.highlighttable .linenos{background-color:var(--md-code-bg-color);border-bottom-left-radius:.1rem;border-top-left-radius:.1rem;font-size:.85em;padding:.7720588235em 0 .7720588235em 1.1764705882em;-webkit-user-select:none;user-select:none}.highlighttable .linenodiv{box-shadow:-.05rem 0 var(--md-default-fg-color--lightest) inset;padding-right:.5882352941em}.highlighttable .linenodiv pre{color:var(--md-default-fg-color--light);text-align:right}.highlighttable .code{flex:1;min-width:0}.linenodiv a{color:inherit}.md-typeset .highlighttable{direction:ltr;margin:1em 0}.md-typeset .highlighttable>tbody>tr>.code>div>pre>code{border-bottom-left-radius:0;border-top-left-radius:0}.md-typeset .highlight+.result{border:.05rem solid var(--md-code-bg-color);border-bottom-left-radius:.1rem;border-bottom-right-radius:.1rem;border-top-width:.1rem;margin-top:-1.125em;overflow:visible;padding:0 1em}.md-typeset .highlight+.result:after{clear:both;content:"";display:block}@media screen and (max-width:44.984375em){.md-content__inner>.highlight{margin:1em -.8rem}.md-content__inner>.highlight>.filename,.md-content__inner>.highlight>.highlighttable>tbody>tr>.code>div>pre>code,.md-content__inner>.highlight>.highlighttable>tbody>tr>.filename 
span.filename,.md-content__inner>.highlight>.highlighttable>tbody>tr>.linenos,.md-content__inner>.highlight>pre>code{border-radius:0}.md-content__inner>.highlight+.result{border-left-width:0;border-radius:0;border-right-width:0;margin-left:-.8rem;margin-right:-.8rem}}.md-typeset .keys kbd:after,.md-typeset .keys kbd:before{-moz-osx-font-smoothing:initial;-webkit-font-smoothing:initial;color:inherit;margin:0;position:relative}.md-typeset .keys span{color:var(--md-default-fg-color--light);padding:0 .2em}.md-typeset .keys .key-alt:before,.md-typeset .keys .key-left-alt:before,.md-typeset .keys .key-right-alt:before{content:"⎇";padding-right:.4em}.md-typeset .keys .key-command:before,.md-typeset .keys .key-left-command:before,.md-typeset .keys .key-right-command:before{content:"⌘";padding-right:.4em}.md-typeset .keys .key-control:before,.md-typeset .keys .key-left-control:before,.md-typeset .keys .key-right-control:before{content:"⌃";padding-right:.4em}.md-typeset .keys .key-left-meta:before,.md-typeset .keys .key-meta:before,.md-typeset .keys .key-right-meta:before{content:"◆";padding-right:.4em}.md-typeset .keys .key-left-option:before,.md-typeset .keys .key-option:before,.md-typeset .keys .key-right-option:before{content:"⌥";padding-right:.4em}.md-typeset .keys .key-left-shift:before,.md-typeset .keys .key-right-shift:before,.md-typeset .keys .key-shift:before{content:"⇧";padding-right:.4em}.md-typeset .keys .key-left-super:before,.md-typeset .keys .key-right-super:before,.md-typeset .keys .key-super:before{content:"❖";padding-right:.4em}.md-typeset .keys .key-left-windows:before,.md-typeset .keys .key-right-windows:before,.md-typeset .keys .key-windows:before{content:"⊞";padding-right:.4em}.md-typeset .keys .key-arrow-down:before{content:"↓";padding-right:.4em}.md-typeset .keys .key-arrow-left:before{content:"←";padding-right:.4em}.md-typeset .keys .key-arrow-right:before{content:"→";padding-right:.4em}.md-typeset .keys 
.key-arrow-up:before{content:"↑";padding-right:.4em}.md-typeset .keys .key-backspace:before{content:"⌫";padding-right:.4em}.md-typeset .keys .key-backtab:before{content:"⇤";padding-right:.4em}.md-typeset .keys .key-caps-lock:before{content:"⇪";padding-right:.4em}.md-typeset .keys .key-clear:before{content:"⌧";padding-right:.4em}.md-typeset .keys .key-context-menu:before{content:"☰";padding-right:.4em}.md-typeset .keys .key-delete:before{content:"⌦";padding-right:.4em}.md-typeset .keys .key-eject:before{content:"⏏";padding-right:.4em}.md-typeset .keys .key-end:before{content:"⤓";padding-right:.4em}.md-typeset .keys .key-escape:before{content:"⎋";padding-right:.4em}.md-typeset .keys .key-home:before{content:"⤒";padding-right:.4em}.md-typeset .keys .key-insert:before{content:"⎀";padding-right:.4em}.md-typeset .keys .key-page-down:before{content:"⇟";padding-right:.4em}.md-typeset .keys .key-page-up:before{content:"⇞";padding-right:.4em}.md-typeset .keys .key-print-screen:before{content:"⎙";padding-right:.4em}.md-typeset .keys .key-tab:after{content:"⇥";padding-left:.4em}.md-typeset .keys .key-num-enter:after{content:"⌤";padding-left:.4em}.md-typeset .keys .key-enter:after{content:"⏎";padding-left:.4em}:root{--md-tabbed-icon--prev:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M15.41 16.58 10.83 12l4.58-4.59L14 6l-6 6 6 6 1.41-1.42Z"/></svg>');--md-tabbed-icon--next:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M8.59 16.58 13.17 12 8.59 7.41 10 6l6 6-6 6-1.41-1.42Z"/></svg>')}.md-typeset .tabbed-set{border-radius:.1rem;display:flex;flex-flow:column wrap;margin:1em 0;position:relative}.md-typeset .tabbed-set>input{height:0;opacity:0;position:absolute;width:0}.md-typeset .tabbed-set>input:target{--md-scroll-offset:0.625em}.md-typeset .tabbed-set>input.focus-visible~.tabbed-labels:before{background-color:var(--md-accent-fg-color)}.md-typeset 
.tabbed-labels{-ms-overflow-style:none;box-shadow:0 -.05rem var(--md-default-fg-color--lightest) inset;display:flex;max-width:100%;overflow:auto;scrollbar-width:none}@media print{.md-typeset .tabbed-labels{display:contents}}@media screen{.js .md-typeset .tabbed-labels{position:relative}.js .md-typeset .tabbed-labels:before{background:var(--md-default-fg-color);bottom:0;content:"";display:block;height:2px;left:0;position:absolute;transform:translateX(var(--md-indicator-x));transition:width 225ms,background-color .25s,transform .25s;transition-timing-function:cubic-bezier(.4,0,.2,1);width:var(--md-indicator-width)}}.md-typeset .tabbed-labels::-webkit-scrollbar{display:none}.md-typeset .tabbed-labels>label{border-bottom:.1rem solid #0000;border-radius:.1rem .1rem 0 0;color:var(--md-default-fg-color--light);cursor:pointer;flex-shrink:0;font-size:.64rem;font-weight:700;padding:.78125em 1.25em .625em;scroll-margin-inline-start:1rem;transition:background-color .25s,color .25s;white-space:nowrap;width:auto}@media print{.md-typeset .tabbed-labels>label:first-child{order:1}.md-typeset .tabbed-labels>label:nth-child(2){order:2}.md-typeset .tabbed-labels>label:nth-child(3){order:3}.md-typeset .tabbed-labels>label:nth-child(4){order:4}.md-typeset .tabbed-labels>label:nth-child(5){order:5}.md-typeset .tabbed-labels>label:nth-child(6){order:6}.md-typeset .tabbed-labels>label:nth-child(7){order:7}.md-typeset .tabbed-labels>label:nth-child(8){order:8}.md-typeset .tabbed-labels>label:nth-child(9){order:9}.md-typeset .tabbed-labels>label:nth-child(10){order:10}.md-typeset .tabbed-labels>label:nth-child(11){order:11}.md-typeset .tabbed-labels>label:nth-child(12){order:12}.md-typeset .tabbed-labels>label:nth-child(13){order:13}.md-typeset .tabbed-labels>label:nth-child(14){order:14}.md-typeset .tabbed-labels>label:nth-child(15){order:15}.md-typeset .tabbed-labels>label:nth-child(16){order:16}.md-typeset .tabbed-labels>label:nth-child(17){order:17}.md-typeset 
.tabbed-labels>label:nth-child(18){order:18}.md-typeset .tabbed-labels>label:nth-child(19){order:19}.md-typeset .tabbed-labels>label:nth-child(20){order:20}}.md-typeset .tabbed-labels>label:hover{color:var(--md-default-fg-color)}.md-typeset .tabbed-labels>label>[href]:first-child{color:inherit}.md-typeset .tabbed-labels--linked>label{padding:0}.md-typeset .tabbed-labels--linked>label>a{display:block;padding:.78125em 1.25em .625em}.md-typeset .tabbed-content{width:100%}@media print{.md-typeset .tabbed-content{display:contents}}.md-typeset .tabbed-block{display:none}@media print{.md-typeset .tabbed-block{display:block}.md-typeset .tabbed-block:first-child{order:1}.md-typeset .tabbed-block:nth-child(2){order:2}.md-typeset .tabbed-block:nth-child(3){order:3}.md-typeset .tabbed-block:nth-child(4){order:4}.md-typeset .tabbed-block:nth-child(5){order:5}.md-typeset .tabbed-block:nth-child(6){order:6}.md-typeset .tabbed-block:nth-child(7){order:7}.md-typeset .tabbed-block:nth-child(8){order:8}.md-typeset .tabbed-block:nth-child(9){order:9}.md-typeset .tabbed-block:nth-child(10){order:10}.md-typeset .tabbed-block:nth-child(11){order:11}.md-typeset .tabbed-block:nth-child(12){order:12}.md-typeset .tabbed-block:nth-child(13){order:13}.md-typeset .tabbed-block:nth-child(14){order:14}.md-typeset .tabbed-block:nth-child(15){order:15}.md-typeset .tabbed-block:nth-child(16){order:16}.md-typeset .tabbed-block:nth-child(17){order:17}.md-typeset .tabbed-block:nth-child(18){order:18}.md-typeset .tabbed-block:nth-child(19){order:19}.md-typeset .tabbed-block:nth-child(20){order:20}}.md-typeset .tabbed-block>.highlight:first-child>pre,.md-typeset .tabbed-block>pre:first-child{margin:0}.md-typeset .tabbed-block>.highlight:first-child>pre>code,.md-typeset .tabbed-block>pre:first-child>code{border-top-left-radius:0;border-top-right-radius:0}.md-typeset .tabbed-block>.highlight:first-child>.filename{border-top-left-radius:0;border-top-right-radius:0;margin:0}.md-typeset 
.tabbed-block>.highlight:first-child>.highlighttable{margin:0}.md-typeset .tabbed-block>.highlight:first-child>.highlighttable>tbody>tr>.filename span.filename,.md-typeset .tabbed-block>.highlight:first-child>.highlighttable>tbody>tr>.linenos{border-top-left-radius:0;border-top-right-radius:0;margin:0}.md-typeset .tabbed-block>.highlight:first-child>.highlighttable>tbody>tr>.code>div>pre>code{border-top-left-radius:0;border-top-right-radius:0}.md-typeset .tabbed-block>.highlight:first-child+.result{margin-top:-.125em}.md-typeset .tabbed-block>.tabbed-set{margin:0}.md-typeset .tabbed-button{align-self:center;border-radius:100%;color:var(--md-default-fg-color--light);cursor:pointer;display:block;height:.9rem;margin-top:.1rem;pointer-events:auto;transition:background-color .25s;width:.9rem}.md-typeset .tabbed-button:hover{background-color:var(--md-accent-fg-color--transparent);color:var(--md-accent-fg-color)}.md-typeset .tabbed-button:after{background-color:currentcolor;content:"";display:block;height:100%;-webkit-mask-image:var(--md-tabbed-icon--prev);mask-image:var(--md-tabbed-icon--prev);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;transition:background-color .25s,transform .25s;width:100%}.md-typeset .tabbed-control{background:linear-gradient(to right,var(--md-default-bg-color) 60%,#0000);display:flex;height:1.9rem;justify-content:start;pointer-events:none;position:absolute;transition:opacity 125ms;width:1.2rem}[dir=rtl] .md-typeset .tabbed-control{transform:rotate(180deg)}.md-typeset .tabbed-control[hidden]{opacity:0}.md-typeset .tabbed-control--next{background:linear-gradient(to left,var(--md-default-bg-color) 60%,#0000);justify-content:end;right:0}.md-typeset .tabbed-control--next .tabbed-button:after{-webkit-mask-image:var(--md-tabbed-icon--next);mask-image:var(--md-tabbed-icon--next)}@media screen and (max-width:44.984375em){[dir=ltr] 
.md-content__inner>.tabbed-set .tabbed-labels{padding-left:.8rem}[dir=rtl] .md-content__inner>.tabbed-set .tabbed-labels{padding-right:.8rem}.md-content__inner>.tabbed-set .tabbed-labels{margin:0 -.8rem;max-width:100vw;scroll-padding-inline-start:.8rem}[dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels:after{padding-right:.8rem}[dir=rtl] .md-content__inner>.tabbed-set .tabbed-labels:after{padding-left:.8rem}.md-content__inner>.tabbed-set .tabbed-labels:after{content:""}[dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--prev{padding-left:.8rem}[dir=rtl] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--prev{padding-right:.8rem}[dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--prev{margin-left:-.8rem}[dir=rtl] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--prev{margin-right:-.8rem}.md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--prev{width:2rem}[dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--next{padding-right:.8rem}[dir=rtl] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--next{padding-left:.8rem}[dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--next{margin-right:-.8rem}[dir=rtl] .md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--next{margin-left:-.8rem}.md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--next{width:2rem}}@media screen{.md-typeset .tabbed-set>input:first-child:checked~.tabbed-labels>:first-child,.md-typeset .tabbed-set>input:nth-child(10):checked~.tabbed-labels>:nth-child(10),.md-typeset .tabbed-set>input:nth-child(11):checked~.tabbed-labels>:nth-child(11),.md-typeset .tabbed-set>input:nth-child(12):checked~.tabbed-labels>:nth-child(12),.md-typeset .tabbed-set>input:nth-child(13):checked~.tabbed-labels>:nth-child(13),.md-typeset .tabbed-set>input:nth-child(14):checked~.tabbed-labels>:nth-child(14),.md-typeset 
.tabbed-set>input:nth-child(15):checked~.tabbed-labels>:nth-child(15),.md-typeset .tabbed-set>input:nth-child(16):checked~.tabbed-labels>:nth-child(16),.md-typeset .tabbed-set>input:nth-child(17):checked~.tabbed-labels>:nth-child(17),.md-typeset .tabbed-set>input:nth-child(18):checked~.tabbed-labels>:nth-child(18),.md-typeset .tabbed-set>input:nth-child(19):checked~.tabbed-labels>:nth-child(19),.md-typeset .tabbed-set>input:nth-child(2):checked~.tabbed-labels>:nth-child(2),.md-typeset .tabbed-set>input:nth-child(20):checked~.tabbed-labels>:nth-child(20),.md-typeset .tabbed-set>input:nth-child(3):checked~.tabbed-labels>:nth-child(3),.md-typeset .tabbed-set>input:nth-child(4):checked~.tabbed-labels>:nth-child(4),.md-typeset .tabbed-set>input:nth-child(5):checked~.tabbed-labels>:nth-child(5),.md-typeset .tabbed-set>input:nth-child(6):checked~.tabbed-labels>:nth-child(6),.md-typeset .tabbed-set>input:nth-child(7):checked~.tabbed-labels>:nth-child(7),.md-typeset .tabbed-set>input:nth-child(8):checked~.tabbed-labels>:nth-child(8),.md-typeset .tabbed-set>input:nth-child(9):checked~.tabbed-labels>:nth-child(9){color:var(--md-default-fg-color)}.md-typeset .no-js .tabbed-set>input:first-child:checked~.tabbed-labels>:first-child,.md-typeset .no-js .tabbed-set>input:nth-child(10):checked~.tabbed-labels>:nth-child(10),.md-typeset .no-js .tabbed-set>input:nth-child(11):checked~.tabbed-labels>:nth-child(11),.md-typeset .no-js .tabbed-set>input:nth-child(12):checked~.tabbed-labels>:nth-child(12),.md-typeset .no-js .tabbed-set>input:nth-child(13):checked~.tabbed-labels>:nth-child(13),.md-typeset .no-js .tabbed-set>input:nth-child(14):checked~.tabbed-labels>:nth-child(14),.md-typeset .no-js .tabbed-set>input:nth-child(15):checked~.tabbed-labels>:nth-child(15),.md-typeset .no-js .tabbed-set>input:nth-child(16):checked~.tabbed-labels>:nth-child(16),.md-typeset .no-js .tabbed-set>input:nth-child(17):checked~.tabbed-labels>:nth-child(17),.md-typeset .no-js 
.tabbed-set>input:nth-child(18):checked~.tabbed-labels>:nth-child(18),.md-typeset .no-js .tabbed-set>input:nth-child(19):checked~.tabbed-labels>:nth-child(19),.md-typeset .no-js .tabbed-set>input:nth-child(2):checked~.tabbed-labels>:nth-child(2),.md-typeset .no-js .tabbed-set>input:nth-child(20):checked~.tabbed-labels>:nth-child(20),.md-typeset .no-js .tabbed-set>input:nth-child(3):checked~.tabbed-labels>:nth-child(3),.md-typeset .no-js .tabbed-set>input:nth-child(4):checked~.tabbed-labels>:nth-child(4),.md-typeset .no-js .tabbed-set>input:nth-child(5):checked~.tabbed-labels>:nth-child(5),.md-typeset .no-js .tabbed-set>input:nth-child(6):checked~.tabbed-labels>:nth-child(6),.md-typeset .no-js .tabbed-set>input:nth-child(7):checked~.tabbed-labels>:nth-child(7),.md-typeset .no-js .tabbed-set>input:nth-child(8):checked~.tabbed-labels>:nth-child(8),.md-typeset .no-js .tabbed-set>input:nth-child(9):checked~.tabbed-labels>:nth-child(9),.no-js .md-typeset .tabbed-set>input:first-child:checked~.tabbed-labels>:first-child,.no-js .md-typeset .tabbed-set>input:nth-child(10):checked~.tabbed-labels>:nth-child(10),.no-js .md-typeset .tabbed-set>input:nth-child(11):checked~.tabbed-labels>:nth-child(11),.no-js .md-typeset .tabbed-set>input:nth-child(12):checked~.tabbed-labels>:nth-child(12),.no-js .md-typeset .tabbed-set>input:nth-child(13):checked~.tabbed-labels>:nth-child(13),.no-js .md-typeset .tabbed-set>input:nth-child(14):checked~.tabbed-labels>:nth-child(14),.no-js .md-typeset .tabbed-set>input:nth-child(15):checked~.tabbed-labels>:nth-child(15),.no-js .md-typeset .tabbed-set>input:nth-child(16):checked~.tabbed-labels>:nth-child(16),.no-js .md-typeset .tabbed-set>input:nth-child(17):checked~.tabbed-labels>:nth-child(17),.no-js .md-typeset .tabbed-set>input:nth-child(18):checked~.tabbed-labels>:nth-child(18),.no-js .md-typeset .tabbed-set>input:nth-child(19):checked~.tabbed-labels>:nth-child(19),.no-js .md-typeset 
.tabbed-set>input:nth-child(2):checked~.tabbed-labels>:nth-child(2),.no-js .md-typeset .tabbed-set>input:nth-child(20):checked~.tabbed-labels>:nth-child(20),.no-js .md-typeset .tabbed-set>input:nth-child(3):checked~.tabbed-labels>:nth-child(3),.no-js .md-typeset .tabbed-set>input:nth-child(4):checked~.tabbed-labels>:nth-child(4),.no-js .md-typeset .tabbed-set>input:nth-child(5):checked~.tabbed-labels>:nth-child(5),.no-js .md-typeset .tabbed-set>input:nth-child(6):checked~.tabbed-labels>:nth-child(6),.no-js .md-typeset .tabbed-set>input:nth-child(7):checked~.tabbed-labels>:nth-child(7),.no-js .md-typeset .tabbed-set>input:nth-child(8):checked~.tabbed-labels>:nth-child(8),.no-js .md-typeset .tabbed-set>input:nth-child(9):checked~.tabbed-labels>:nth-child(9){border-color:var(--md-default-fg-color)}}.md-typeset .tabbed-set>input:first-child.focus-visible~.tabbed-labels>:first-child,.md-typeset .tabbed-set>input:nth-child(10).focus-visible~.tabbed-labels>:nth-child(10),.md-typeset .tabbed-set>input:nth-child(11).focus-visible~.tabbed-labels>:nth-child(11),.md-typeset .tabbed-set>input:nth-child(12).focus-visible~.tabbed-labels>:nth-child(12),.md-typeset .tabbed-set>input:nth-child(13).focus-visible~.tabbed-labels>:nth-child(13),.md-typeset .tabbed-set>input:nth-child(14).focus-visible~.tabbed-labels>:nth-child(14),.md-typeset .tabbed-set>input:nth-child(15).focus-visible~.tabbed-labels>:nth-child(15),.md-typeset .tabbed-set>input:nth-child(16).focus-visible~.tabbed-labels>:nth-child(16),.md-typeset .tabbed-set>input:nth-child(17).focus-visible~.tabbed-labels>:nth-child(17),.md-typeset .tabbed-set>input:nth-child(18).focus-visible~.tabbed-labels>:nth-child(18),.md-typeset .tabbed-set>input:nth-child(19).focus-visible~.tabbed-labels>:nth-child(19),.md-typeset .tabbed-set>input:nth-child(2).focus-visible~.tabbed-labels>:nth-child(2),.md-typeset .tabbed-set>input:nth-child(20).focus-visible~.tabbed-labels>:nth-child(20),.md-typeset 
.tabbed-set>input:nth-child(3).focus-visible~.tabbed-labels>:nth-child(3),.md-typeset .tabbed-set>input:nth-child(4).focus-visible~.tabbed-labels>:nth-child(4),.md-typeset .tabbed-set>input:nth-child(5).focus-visible~.tabbed-labels>:nth-child(5),.md-typeset .tabbed-set>input:nth-child(6).focus-visible~.tabbed-labels>:nth-child(6),.md-typeset .tabbed-set>input:nth-child(7).focus-visible~.tabbed-labels>:nth-child(7),.md-typeset .tabbed-set>input:nth-child(8).focus-visible~.tabbed-labels>:nth-child(8),.md-typeset .tabbed-set>input:nth-child(9).focus-visible~.tabbed-labels>:nth-child(9){color:var(--md-accent-fg-color)}.md-typeset .tabbed-set>input:first-child:checked~.tabbed-content>:first-child,.md-typeset .tabbed-set>input:nth-child(10):checked~.tabbed-content>:nth-child(10),.md-typeset .tabbed-set>input:nth-child(11):checked~.tabbed-content>:nth-child(11),.md-typeset .tabbed-set>input:nth-child(12):checked~.tabbed-content>:nth-child(12),.md-typeset .tabbed-set>input:nth-child(13):checked~.tabbed-content>:nth-child(13),.md-typeset .tabbed-set>input:nth-child(14):checked~.tabbed-content>:nth-child(14),.md-typeset .tabbed-set>input:nth-child(15):checked~.tabbed-content>:nth-child(15),.md-typeset .tabbed-set>input:nth-child(16):checked~.tabbed-content>:nth-child(16),.md-typeset .tabbed-set>input:nth-child(17):checked~.tabbed-content>:nth-child(17),.md-typeset .tabbed-set>input:nth-child(18):checked~.tabbed-content>:nth-child(18),.md-typeset .tabbed-set>input:nth-child(19):checked~.tabbed-content>:nth-child(19),.md-typeset .tabbed-set>input:nth-child(2):checked~.tabbed-content>:nth-child(2),.md-typeset .tabbed-set>input:nth-child(20):checked~.tabbed-content>:nth-child(20),.md-typeset .tabbed-set>input:nth-child(3):checked~.tabbed-content>:nth-child(3),.md-typeset .tabbed-set>input:nth-child(4):checked~.tabbed-content>:nth-child(4),.md-typeset .tabbed-set>input:nth-child(5):checked~.tabbed-content>:nth-child(5),.md-typeset 
.tabbed-set>input:nth-child(6):checked~.tabbed-content>:nth-child(6),.md-typeset .tabbed-set>input:nth-child(7):checked~.tabbed-content>:nth-child(7),.md-typeset .tabbed-set>input:nth-child(8):checked~.tabbed-content>:nth-child(8),.md-typeset .tabbed-set>input:nth-child(9):checked~.tabbed-content>:nth-child(9){display:block}:root{--md-tasklist-icon:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M1 12C1 5.925 5.925 1 12 1s11 4.925 11 11-4.925 11-11 11S1 18.075 1 12Zm16.28-2.72a.751.751 0 0 0-.018-1.042.751.751 0 0 0-1.042-.018l-5.97 5.97-2.47-2.47a.751.751 0 0 0-1.042.018.751.751 0 0 0-.018 1.042l3 3a.75.75 0 0 0 1.06 0Z"/></svg>');--md-tasklist-icon--checked:url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M1 12C1 5.925 5.925 1 12 1s11 4.925 11 11-4.925 11-11 11S1 18.075 1 12Zm16.28-2.72a.751.751 0 0 0-.018-1.042.751.751 0 0 0-1.042-.018l-5.97 5.97-2.47-2.47a.751.751 0 0 0-1.042.018.751.751 0 0 0-.018 1.042l3 3a.75.75 0 0 0 1.06 0Z"/></svg>')}.md-typeset .task-list-item{list-style-type:none;position:relative}[dir=ltr] .md-typeset .task-list-item [type=checkbox]{left:-2em}[dir=rtl] .md-typeset .task-list-item [type=checkbox]{right:-2em}.md-typeset .task-list-item [type=checkbox]{position:absolute;top:.45em}.md-typeset .task-list-control [type=checkbox]{opacity:0;z-index:-1}[dir=ltr] .md-typeset .task-list-indicator:before{left:-1.5em}[dir=rtl] .md-typeset .task-list-indicator:before{right:-1.5em}.md-typeset .task-list-indicator:before{background-color:var(--md-default-fg-color--lightest);content:"";height:1.25em;-webkit-mask-image:var(--md-tasklist-icon);mask-image:var(--md-tasklist-icon);-webkit-mask-position:center;mask-position:center;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;position:absolute;top:.15em;width:1.25em}.md-typeset 
[type=checkbox]:checked+.task-list-indicator:before{background-color:#00e676;-webkit-mask-image:var(--md-tasklist-icon--checked);mask-image:var(--md-tasklist-icon--checked)}:root>*{--md-mermaid-font-family:var(--md-text-font-family),sans-serif;--md-mermaid-edge-color:var(--md-code-fg-color);--md-mermaid-node-bg-color:var(--md-accent-fg-color--transparent);--md-mermaid-node-fg-color:var(--md-accent-fg-color);--md-mermaid-label-bg-color:var(--md-default-bg-color);--md-mermaid-label-fg-color:var(--md-code-fg-color);--md-mermaid-sequence-actor-bg-color:var(--md-mermaid-label-bg-color);--md-mermaid-sequence-actor-fg-color:var(--md-mermaid-label-fg-color);--md-mermaid-sequence-actor-border-color:var(--md-mermaid-node-fg-color);--md-mermaid-sequence-actor-line-color:var(--md-default-fg-color--lighter);--md-mermaid-sequence-actorman-bg-color:var(--md-mermaid-label-bg-color);--md-mermaid-sequence-actorman-line-color:var(--md-mermaid-node-fg-color);--md-mermaid-sequence-box-bg-color:var(--md-mermaid-node-bg-color);--md-mermaid-sequence-box-fg-color:var(--md-mermaid-edge-color);--md-mermaid-sequence-label-bg-color:var(--md-mermaid-node-bg-color);--md-mermaid-sequence-label-fg-color:var(--md-mermaid-node-fg-color);--md-mermaid-sequence-loop-bg-color:var(--md-mermaid-node-bg-color);--md-mermaid-sequence-loop-fg-color:var(--md-mermaid-edge-color);--md-mermaid-sequence-loop-border-color:var(--md-mermaid-node-fg-color);--md-mermaid-sequence-message-fg-color:var(--md-mermaid-edge-color);--md-mermaid-sequence-message-line-color:var(--md-mermaid-edge-color);--md-mermaid-sequence-note-bg-color:var(--md-mermaid-label-bg-color);--md-mermaid-sequence-note-fg-color:var(--md-mermaid-edge-color);--md-mermaid-sequence-note-border-color:var(--md-mermaid-label-fg-color);--md-mermaid-sequence-number-bg-color:var(--md-mermaid-node-fg-color);--md-mermaid-sequence-number-fg-color:var(--md-accent-bg-color)}.mermaid{line-height:normal;margin:1em 0}.md-typeset 
.grid{grid-gap:.4rem;display:grid;grid-template-columns:repeat(auto-fit,minmax(min(100%,16rem),1fr));margin:1em 0}.md-typeset .grid.cards>ol,.md-typeset .grid.cards>ul{display:contents}.md-typeset .grid.cards>ol>li,.md-typeset .grid.cards>ul>li,.md-typeset .grid>.card{border:.05rem solid var(--md-default-fg-color--lightest);border-radius:.1rem;display:block;margin:0;padding:.8rem;transition:border .25s,box-shadow .25s}.md-typeset .grid.cards>ol>li:focus-within,.md-typeset .grid.cards>ol>li:hover,.md-typeset .grid.cards>ul>li:focus-within,.md-typeset .grid.cards>ul>li:hover,.md-typeset .grid>.card:focus-within,.md-typeset .grid>.card:hover{border-color:#0000;box-shadow:var(--md-shadow-z2)}.md-typeset .grid.cards>ol>li>hr,.md-typeset .grid.cards>ul>li>hr,.md-typeset .grid>.card>hr{margin-bottom:1em;margin-top:1em}.md-typeset .grid.cards>ol>li>:first-child,.md-typeset .grid.cards>ul>li>:first-child,.md-typeset .grid>.card>:first-child{margin-top:0}.md-typeset .grid.cards>ol>li>:last-child,.md-typeset .grid.cards>ul>li>:last-child,.md-typeset .grid>.card>:last-child{margin-bottom:0}.md-typeset .grid>*,.md-typeset .grid>.admonition,.md-typeset .grid>.highlight>*,.md-typeset .grid>.highlighttable,.md-typeset .grid>.md-typeset details,.md-typeset .grid>details,.md-typeset .grid>pre{margin-bottom:0;margin-top:0}.md-typeset .grid>.highlight>pre:only-child,.md-typeset .grid>.highlight>pre>code,.md-typeset .grid>.highlighttable,.md-typeset .grid>.highlighttable>tbody,.md-typeset .grid>.highlighttable>tbody>tr,.md-typeset .grid>.highlighttable>tbody>tr>.code,.md-typeset .grid>.highlighttable>tbody>tr>.code>.highlight,.md-typeset .grid>.highlighttable>tbody>tr>.code>.highlight>pre,.md-typeset .grid>.highlighttable>tbody>tr>.code>.highlight>pre>code{height:100%}.md-typeset .grid>.tabbed-set{margin-bottom:0;margin-top:0}@media screen and (min-width:45em){[dir=ltr] .md-typeset .inline{float:left}[dir=rtl] .md-typeset .inline{float:right}[dir=ltr] .md-typeset 
.inline{margin-right:.8rem}[dir=rtl] .md-typeset .inline{margin-left:.8rem}.md-typeset .inline{margin-bottom:.8rem;margin-top:0;width:11.7rem}[dir=ltr] .md-typeset .inline.end{float:right}[dir=rtl] .md-typeset .inline.end{float:left}[dir=ltr] .md-typeset .inline.end{margin-left:.8rem;margin-right:0}[dir=rtl] .md-typeset .inline.end{margin-left:0;margin-right:.8rem}}
\ No newline at end of file
diff --git a/assets/stylesheets/main.76a95c52.min.css.map b/assets/stylesheets/main.76a95c52.min.css.map
new file mode 100644
index 0000000..ee35967
--- /dev/null
+++ b/assets/stylesheets/main.76a95c52.min.css.map
@@ -0,0 +1 @@
+{"version":3,"sources":["src/templates/assets/stylesheets/main/components/_meta.scss","../../../../src/templates/assets/stylesheets/main.scss","src/templates/assets/stylesheets/main/_resets.scss","src/templates/assets/stylesheets/main/_colors.scss","src/templates/assets/stylesheets/main/_icons.scss","src/templates/assets/stylesheets/main/_typeset.scss","src/templates/assets/stylesheets/utilities/_break.scss","src/templates/assets/stylesheets/main/components/_author.scss","src/templates/assets/stylesheets/main/components/_banner.scss","src/templates/assets/stylesheets/main/components/_base.scss","src/templates/assets/stylesheets/main/components/_clipboard.scss","src/templates/assets/stylesheets/main/components/_code.scss","src/templates/assets/stylesheets/main/components/_consent.scss","src/templates/assets/stylesheets/main/components/_content.scss","src/templates/assets/stylesheets/main/components/_dialog.scss","src/templates/assets/stylesheets/main/components/_feedback.scss","src/templates/assets/stylesheets/main/components/_footer.scss","src/templates/assets/stylesheets/main/components/_form.scss","src/templates/assets/stylesheets/main/components/_header.scss","node_modules/material-design-color/material-color.scss","src/templates/assets/stylesheets/main/components/_nav.scss","src/templates/assets/stylesheets/main/components/_pagination.scss","src/templates/assets/stylesheets/main/components/_post.scss","src/templates/assets/stylesheets/main/components/_progress.scss","src/templates/assets/stylesheets/main/components/_search.scss","src/templates/assets/stylesheets/main/components/_select.scss","src/templates/assets/stylesheets/main/components/_sidebar.scss","src/templates/assets/stylesheets/main/components/_source.scss","src/templates/assets/stylesheets/main/components/_status.scss","src/templates/assets/stylesheets/main/components/_tabs.scss","src/templates/assets/stylesheets/main/components/_tag.scss","src/templates/assets/stylesheets/main/components/_tooltip.s
css","src/templates/assets/stylesheets/main/components/_tooltip2.scss","src/templates/assets/stylesheets/main/components/_top.scss","src/templates/assets/stylesheets/main/components/_version.scss","src/templates/assets/stylesheets/main/extensions/markdown/_admonition.scss","src/templates/assets/stylesheets/main/extensions/markdown/_footnotes.scss","src/templates/assets/stylesheets/main/extensions/markdown/_toc.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_arithmatex.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_critic.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_details.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_emoji.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_highlight.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_keys.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_tabbed.scss","src/templates/assets/stylesheets/main/extensions/pymdownx/_tasklist.scss","src/templates/assets/stylesheets/main/integrations/_mermaid.scss","src/templates/assets/stylesheets/main/modifiers/_grid.scss","src/templates/assets/stylesheets/main/modifiers/_inline.scss"],"names":[],"mappings":"AA0CE,gBCqxCF,CCnyCA,KAEE,6BAAA,CAAA,0BAAA,CAAA,qBAAA,CADA,qBDzBF,CC8BA,iBAGE,kBD3BF,CC8BE,gCANF,iBAOI,yBDzBF,CACF,CC6BA,KACE,QD1BF,CC8BA,qBAIE,uCD3BF,CC+BA,EACE,aAAA,CACA,oBD5BF,CCgCA,GAME,QAAA,CALA,kBAAA,CACA,aAAA,CACA,aAAA,CAEA,gBAAA,CADA,SD3BF,CCiCA,MACE,aD9BF,CCkCA,QAEE,eD/BF,CCmCA,IACE,iBDhCF,CCoCA,MAEE,uBAAA,CADA,gBDhCF,CCqCA,MAEE,eAAA,CACA,kBDlCF,CCsCA,OAKE,gBAAA,CACA,QAAA,CAHA,mBAAA,CACA,iBAAA,CAFA,QAAA,CADA,SD9BF,CCuCA,MACE,QAAA,CACA,YDpCF,CErDA,MAIE,6BAAA,CACA,oCAAA,CACA,mCAAA,CACA,0BAAA,CACA,sCAAA,CAGA,4BAAA,CACA,2CAAA,CACA,yBAAA,CACA,qCFmDF,CE7CA,+BAIE,kBF6CF,CE1CE,oHAEE,YF4CJ,CEnCA,qCAIE,eAAA,CAGA,+BAAA,CACA,sCAAA,CACA,wCAAA,CACA,yCAAA,CACA,0BAAA,CACA,sCAAA,CACA,wCAAA,CACA,yCAAA,CAGA,0BAAA,CACA,0BAAA,CAGA,0BAAA,CACA,mCAAA,CAGA,iCAAA,CACA,kCAAA,CACA,m
CAAA,CACA,mCAAA,CACA,kCAAA,CACA,iCAAA,CACA,+CAAA,CACA,6DAAA,CACA,gEAAA,CACA,4DAAA,CACA,4DAAA,CACA,6DAAA,CAGA,6CAAA,CAGA,+CAAA,CAGA,gCAAA,CACA,gCAAA,CAGA,8BAAA,CACA,kCAAA,CACA,qCAAA,CAGA,iCAAA,CAGA,kCAAA,CACA,gDAAA,CAGA,mDAAA,CACA,mDAAA,CAGA,+BAAA,CACA,0BAAA,CAGA,yBAAA,CACA,qCAAA,CACA,uCAAA,CACA,8BAAA,CACA,oCAAA,CAGA,8DAAA,CAKA,8DAAA,CAKA,0DFKF,CG9HE,aAIE,iBAAA,CAHA,aAAA,CAEA,aAAA,CADA,YHmIJ,CIxIA,KACE,kCAAA,CACA,iCAAA,CAGA,uGAAA,CAKA,mFJyIF,CInIA,iBAIE,mCAAA,CACA,6BAAA,CAFA,sCJwIF,CIlIA,aAIE,4BAAA,CADA,sCJsIF,CI7HA,MACE,0NAAA,CACA,mNAAA,CACA,oNJgIF,CIzHA,YAGE,gCAAA,CAAA,kBAAA,CAFA,eAAA,CACA,eJ6HF,CIxHE,aAPF,YAQI,gBJ2HF,CACF,CIxHE,uGAME,iBAAA,CAAA,cJ0HJ,CItHE,eAKE,uCAAA,CAHA,aAAA,CAEA,eAAA,CAHA,iBJ6HJ,CIpHE,8BAPE,eAAA,CAGA,qBJ+HJ,CI3HE,eAEE,kBAAA,CAEA,eAAA,CAHA,oBJ0HJ,CIlHE,eAEE,gBAAA,CACA,eAAA,CAEA,qBAAA,CADA,eAAA,CAHA,mBJwHJ,CIhHE,kBACE,eJkHJ,CI9GE,eAEE,eAAA,CACA,qBAAA,CAFA,YJkHJ,CI5GE,8BAKE,uCAAA,CAFA,cAAA,CACA,eAAA,CAEA,qBAAA,CAJA,eJkHJ,CI1GE,eACE,wBJ4GJ,CIxGE,eAGE,+DAAA,CAFA,iBAAA,CACA,cJ2GJ,CItGE,cACE,+BAAA,CACA,qBJwGJ,CIrGI,mCAEE,sBJsGN,CIlGI,wCACE,+BJoGN,CIjGM,kDACE,uDJmGR,CI9FI,mBACE,kBAAA,CACA,iCJgGN,CI5FI,4BACE,uCAAA,CACA,oBJ8FN,CIzFE,iDAIE,6BAAA,CACA,aAAA,CAFA,2BJ6FJ,CIxFI,aARF,iDASI,oBJ6FJ,CACF,CIzFE,iBAIE,wCAAA,CACA,mBAAA,CACA,kCAAA,CAAA,0BAAA,CAJA,eAAA,CADA,uBAAA,CAEA,qBJ8FJ,CIxFI,qCAEE,uCAAA,CADA,YJ2FN,CIrFE,gBAEE,iBAAA,CACA,eAAA,CAFA,iBJyFJ,CIpFI,qBASE,kCAAA,CAAA,0BAAA,CADA,eAAA,CAPA,aAAA,CAEA,QAAA,CAIA,uCAAA,CAHA,aAAA,CAFA,oCAAA,CASA,yDAAA,CADA,oBAAA,CAJA,iBAAA,CADA,iBJ4FN,CInFM,2BACE,+CJqFR,CIjFM,wCAEE,YAAA,CADA,WJoFR,CI/EM,8CACE,oDJiFR,CI9EQ,oDACE,0CJgFV,CIzEE,gBAOE,4CAAA,CACA,mBAAA,CACA,mKACE,CANF,gCAAA,CAHA,oBAAA,CAEA,eAAA,CADA,uBAAA,CAIA,uBAAA,CADA,qBJ+EJ,CIpEE,iBAGE,6CAAA,CACA,kCAAA,CAAA,0BAAA,CAHA,aAAA,CACA,qBJwEJ,CIlEE,iBAGE,6DAAA,CADA,WAAA,CADA,oBJsEJ,CIhEE,kBACE,WJkEJ,CI9DE,oDAEE,qBJgEJ,CIlEE,oDAEE,sBJgEJ,CI5DE,iCACE,kBJiEJ,CIlEE,iCACE,mBJiEJ,CIlEE,iCAIE,2DJ8DJ,CIlEE,iCAIE,4DJ8DJ,CIlEE,uBAGE,uCAAA,CADA,aAAA,CAAA,cJgEJ,CI1DE,eACE,oBJ4DJ,CIx
DE,kDAGE,kBJ0DJ,CI7DE,kDAGE,mBJ0DJ,CI7DE,8BAEE,SJ2DJ,CIvDI,0DACE,iBJ0DN,CItDI,oCACE,2BJyDN,CItDM,0CACE,2BJyDR,CIpDI,wDACE,kBJwDN,CIzDI,wDACE,mBJwDN,CIzDI,oCAEE,kBJuDN,CIpDM,kGAEE,aJwDR,CIpDM,0DACE,eJuDR,CInDM,4HAEE,kBJsDR,CIxDM,4HAEE,mBJsDR,CIxDM,oFACE,kBAAA,CAAA,eJuDR,CIhDE,yBAEE,mBJkDJ,CIpDE,yBAEE,oBJkDJ,CIpDE,eACE,mBAAA,CAAA,cJmDJ,CI9CE,kDAIE,WAAA,CADA,cJiDJ,CIzCI,4BAEE,oBJ2CN,CIvCI,6BAEE,oBJyCN,CIrCI,kCACE,YJuCN,CIlCE,mBACE,iBAAA,CAGA,eAAA,CADA,cAAA,CAEA,iBAAA,CAHA,sBAAA,CAAA,iBJuCJ,CIjCI,uBACE,aAAA,CACA,aJmCN,CI9BE,uBAGE,iBAAA,CADA,eAAA,CADA,eJkCJ,CI5BE,mBACE,cJ8BJ,CI1BE,+BAME,2CAAA,CACA,iDAAA,CACA,mBAAA,CAPA,oBAAA,CAGA,gBAAA,CAFA,cAAA,CACA,aAAA,CAEA,iBJ+BJ,CIzBI,aAXF,+BAYI,aJ4BJ,CACF,CIvBI,iCACE,gBJyBN,CIlBM,8FACE,YJoBR,CIhBM,4FACE,eJkBR,CIbI,8FACE,eJeN,CIZM,kHACE,gBJcR,CITI,kCAGE,eAAA,CAFA,cAAA,CACA,sBAAA,CAEA,kBJWN,CIPI,kCAGE,qDAAA,CAFA,sBAAA,CACA,kBJUN,CILI,wCACE,iCJON,CIJM,8CACE,qDAAA,CACA,sDJMR,CIDI,iCACE,iBJGN,CIEE,wCACE,cJAJ,CIGI,wDAIE,gBJKN,CITI,wDAIE,iBJKN,CITI,8CAME,UAAA,CALA,oBAAA,CAEA,YAAA,CAKA,oDAAA,CAAA,4CAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAHA,iCAAA,CAFA,0BAAA,CAHA,WJON,CIKI,oDACE,oDJHN,CIOI,mEACE,kDAAA,CACA,yDAAA,CAAA,iDJLN,CISI,oEACE,kDAAA,CACA,0DAAA,CAAA,kDJPN,CIYE,wBACE,iBAAA,CACA,eAAA,CACA,iBJVJ,CIcE,mBACE,oBAAA,CAEA,kBAAA,CADA,eJXJ,CIeI,aANF,mBAOI,aJZJ,CACF,CIeI,8BACE,aAAA,CAEA,QAAA,CACA,eAAA,CAFA,UJXN,CKnVI,0CD6WF,uBACE,iBJtBF,CIyBE,4BACE,eJvBJ,CACF,CMlhBE,uBAOE,kBAAA,CALA,aAAA,CACA,aAAA,CAEA,aAAA,CACA,eAAA,CALA,iBAAA,CAOA,sCACE,CALF,YNwhBJ,CM/gBI,2BACE,aNihBN,CM7gBI,6BAME,+CAAA,CAFA,yCAAA,CAHA,eAAA,CACA,eAAA,CACA,kBAAA,CAEA,iBNghBN,CM3gBI,6BAEE,aAAA,CADA,YN8gBN,CMxgBE,wBACE,kBN0gBJ,CMvgBI,4BAIE,kBAAA,CAHA,mCAAA,CAIA,uBNugBN,CMngBI,4DAEE,oBAAA,CADA,SNsgBN,CMlgBM,oEACE,mBNogBR,CO7jBA,WAGE,0CAAA,CADA,+BAAA,CADA,aPkkBF,CO7jBE,aANF,WAOI,YPgkBF,CACF,CO7jBE,oBAEE,2CAAA,CADA,gCPgkBJ,CO3jBE,kBAGE,eAAA,CADA,iBAAA,CADA,eP+jBJ,COzjBE,6BACE,WP8jBJ,CO/jBE,6BACE,UP8jBJ,CO/jBE,mBAEE,aAAA,CACA,cAAA,CACA,uBP2jBJ,COxjBI,0BACE,YP0jBN,COtjBI,yBA
CE,UPwjBN,CQ7lBA,KASE,cAAA,CARA,WAAA,CACA,iBRimBF,CK7bI,oCGtKJ,KAaI,gBR0lBF,CACF,CKlcI,oCGtKJ,KAkBI,cR0lBF,CACF,CQrlBA,KASE,2CAAA,CAPA,YAAA,CACA,qBAAA,CAKA,eAAA,CAHA,eAAA,CAJA,iBAAA,CAGA,UR2lBF,CQnlBE,aAZF,KAaI,aRslBF,CACF,CKncI,0CGhJF,yBAII,cRmlBJ,CACF,CQ1kBA,SAEE,gBAAA,CAAA,iBAAA,CADA,eR8kBF,CQzkBA,cACE,YAAA,CACA,qBAAA,CACA,WR4kBF,CQzkBE,aANF,cAOI,aR4kBF,CACF,CQxkBA,SACE,WR2kBF,CQxkBE,gBACE,YAAA,CACA,WAAA,CACA,iBR0kBJ,CQrkBA,aACE,eAAA,CACA,sBRwkBF,CQ/jBA,WACE,YRkkBF,CQ7jBA,WAGE,QAAA,CACA,SAAA,CAHA,iBAAA,CACA,ORkkBF,CQ7jBE,uCACE,aR+jBJ,CQ3jBE,+BAEE,uCAAA,CADA,kBR8jBJ,CQxjBA,SASE,2CAAA,CACA,mBAAA,CAFA,gCAAA,CADA,gBAAA,CADA,YAAA,CAMA,SAAA,CADA,uCAAA,CANA,mBAAA,CAJA,cAAA,CAYA,2BAAA,CATA,URkkBF,CQtjBE,eAEE,SAAA,CAIA,uBAAA,CAHA,oEACE,CAHF,UR2jBJ,CQ7iBA,MACE,WRgjBF,CSzsBA,MACE,+PT2sBF,CSrsBA,cASE,mBAAA,CAFA,0CAAA,CACA,cAAA,CAFA,YAAA,CAIA,uCAAA,CACA,oBAAA,CAVA,iBAAA,CAEA,UAAA,CADA,QAAA,CAUA,qBAAA,CAPA,WAAA,CADA,STgtBF,CSrsBE,aAfF,cAgBI,YTwsBF,CACF,CSrsBE,kCAEE,uCAAA,CADA,YTwsBJ,CSnsBE,qBACE,uCTqsBJ,CSjsBE,wCACE,+BTmsBJ,CS9rBE,oBAME,6BAAA,CADA,UAAA,CAJA,aAAA,CAEA,cAAA,CACA,aAAA,CAGA,2CAAA,CAAA,mCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CARA,aTwsBJ,CS5rBE,sBACE,cT8rBJ,CS3rBI,2BACE,2CT6rBN,CSvrBI,kEAEE,uDAAA,CADA,+BT0rBN,CU5vBE,8BACE,YV+vBJ,CWpwBA,mBACE,GACE,SAAA,CACA,0BXuwBF,CWpwBA,GACE,SAAA,CACA,uBXswBF,CACF,CWlwBA,mBACE,GACE,SXowBF,CWjwBA,GACE,SXmwBF,CACF,CWxvBE,qBASE,2BAAA,CADA,mCAAA,CAAA,2BAAA,CAFA,0BAAA,CADA,WAAA,CAEA,SAAA,CANA,cAAA,CACA,KAAA,CAEA,UAAA,CADA,SXgwBJ,CWtvBE,mBAcE,mDAAA,CANA,2CAAA,CACA,QAAA,CACA,mBAAA,CARA,QAAA,CASA,kDACE,CAPF,eAAA,CAEA,aAAA,CADA,SAAA,CALA,cAAA,CAGA,UAAA,CADA,SXiwBJ,CWlvBE,kBACE,aXovBJ,CWhvBE,sBACE,YAAA,CACA,YXkvBJ,CW/uBI,oCACE,aXivBN,CW5uBE,sBACE,mBX8uBJ,CW3uBI,6CACE,cX6uBN,CKvoBI,0CMvGA,6CAKI,aAAA,CAEA,gBAAA,CACA,iBAAA,CAFA,UX+uBN,CACF,CWxuBE,kBACE,cX0uBJ,CY30BA,YACE,WAAA,CAIA,WZ20BF,CYx0BE,mBAEE,qBAAA,CADA,iBZ20BJ,CK9qBI,sCOtJE,4EACE,kBZu0BN,CYn0BI,0JACE,mBZq0BN,CYt0BI,8EACE,kBZq0BN,CACF,CYh0BI,0BAGE,UAAA,CA
FA,aAAA,CACA,YZm0BN,CY9zBI,+BACE,eZg0BN,CY1zBE,8BACE,WZ+zBJ,CYh0BE,8BACE,UZ+zBJ,CYh0BE,8BAIE,iBZ4zBJ,CYh0BE,8BAIE,kBZ4zBJ,CYh0BE,oBAGE,cAAA,CADA,SZ8zBJ,CYzzBI,aAPF,oBAQI,YZ4zBJ,CACF,CYzzBI,gCACE,yCZ2zBN,CYvzBI,wBACE,cAAA,CACA,kBZyzBN,CYtzBM,kCACE,oBZwzBR,Caz3BA,qBAeE,Wb03BF,Caz4BA,qBAeE,Ub03BF,Caz4BA,WAOE,2CAAA,CACA,mBAAA,CANA,YAAA,CAOA,8BAAA,CALA,iBAAA,CAMA,SAAA,CALA,mBAAA,CACA,mBAAA,CALA,cAAA,CAaA,0BAAA,CAHA,wCACE,CATF,Sbs4BF,Cav3BE,aAlBF,WAmBI,Yb03BF,CACF,Cav3BE,mBAEE,SAAA,CADA,mBAAA,CAKA,uBAAA,CAHA,kEb03BJ,Can3BE,kBAEE,gCAAA,CADA,ebs3BJ,Ccx5BA,aACE,gBAAA,CACA,iBd25BF,Ccx5BE,sBAGE,WAAA,CADA,QAAA,CADA,Sd45BJ,Cct5BE,oBAEE,eAAA,CADA,edy5BJ,Ccp5BE,oBACE,iBds5BJ,Ccl5BE,mBAEE,YAAA,CACA,cAAA,CACA,6BAAA,CAHA,iBdu5BJ,Ccj5BI,iDACE,yCdm5BN,Cc/4BI,6BACE,iBdi5BN,Cc54BE,mBAGE,uCAAA,CACA,cAAA,CAHA,aAAA,CACA,cAAA,CAGA,sBd84BJ,Cc34BI,gDACE,+Bd64BN,Ccz4BI,4BACE,0CAAA,CACA,mBd24BN,Cct4BE,mBAEE,SAAA,CADA,iBAAA,CAKA,2BAAA,CAHA,8Ddy4BJ,Ccn4BI,qBAEE,aAAA,CADA,eds4BN,Ccj4BI,6BACE,SAAA,CACA,uBdm4BN,Cej9BA,WAEE,0CAAA,CADA,+Bfq9BF,Cej9BE,aALF,WAMI,Yfo9BF,CACF,Cej9BE,kBACE,6BAAA,CAEA,aAAA,CADA,afo9BJ,Ceh9BI,gCACE,Yfk9BN,Ce78BE,iBAOE,eAAA,CANA,YAAA,CAKA,cAAA,CAGA,mBAAA,CAAA,eAAA,CADA,cAAA,CAGA,uCAAA,CADA,eAAA,CAEA,uBf28BJ,Cex8BI,8CACE,Uf08BN,Cet8BI,+BACE,oBfw8BN,CK1zBI,0CUvIE,uBACE,afo8BN,Cej8BM,yCACE,Yfm8BR,CACF,Ce97BI,iCACE,gBfi8BN,Cel8BI,iCACE,iBfi8BN,Cel8BI,uBAEE,gBfg8BN,Ce77BM,iCACE,ef+7BR,Cez7BE,kBACE,WAAA,CAIA,eAAA,CADA,mBAAA,CAFA,6BAAA,CACA,cAAA,CAGA,kBf27BJ,Cev7BE,mBAEE,YAAA,CADA,af07BJ,Cer7BE,sBACE,gBAAA,CACA,Ufu7BJ,Cel7BA,gBACE,gDfq7BF,Cel7BE,uBACE,YAAA,CACA,cAAA,CACA,6BAAA,CACA,afo7BJ,Ceh7BE,kCACE,sCfk7BJ,Ce/6BI,gFACE,+Bfi7BN,Cez6BA,cAKE,wCAAA,CADA,gBAAA,CADA,iBAAA,CADA,eAAA,CADA,Ufg7BF,CKp4BI,mCU7CJ,cASI,Uf46BF,CACF,Cex6BE,yBACE,sCf06BJ,Cen6BA,WACE,mBAAA,CACA,SAAA,CAEA,cAAA,CADA,qBfu6BF,CKn5BI,mCUvBJ,WAQI,efs6BF,CACF,Cen6BE,iBACE,oBAAA,CAEA,aAAA,CACA,iBAAA,CAFA,Yfu6BJ,Cel6BI,wBACE,efo6BN,Ceh6BI,qBAGE,iBAAA,CAFA,gBAAA,CACA,mBfm6BN,CgBzkCE,uBAME,kBAAA,CACA,mBAAA,CAHA,gCAAA,CACA,cA
AA,CAJA,oBAAA,CAEA,eAAA,CADA,kBAAA,CAMA,gEhB4kCJ,CgBtkCI,gCAEE,2CAAA,CACA,uCAAA,CAFA,gChB0kCN,CgBpkCI,0DAEE,0CAAA,CACA,sCAAA,CAFA,+BhBwkCN,CgBjkCE,gCAKE,4BhBskCJ,CgB3kCE,gEAME,6BhBqkCJ,CgB3kCE,gCAME,4BhBqkCJ,CgB3kCE,sBAIE,6DAAA,CAGA,8BAAA,CAJA,eAAA,CAFA,aAAA,CACA,eAAA,CAMA,sChBmkCJ,CgB9jCI,wDACE,6CAAA,CACA,8BhBgkCN,CgB5jCI,+BACE,UhB8jCN,CiBjnCA,WAOE,2CAAA,CAGA,8CACE,CALF,gCAAA,CADA,aAAA,CAHA,MAAA,CADA,eAAA,CACA,OAAA,CACA,KAAA,CACA,SjBwnCF,CiB7mCE,aAfF,WAgBI,YjBgnCF,CACF,CiB7mCE,mBAIE,2BAAA,CAHA,iEjBgnCJ,CiBzmCE,mBACE,kDACE,CAEF,kEjBymCJ,CiBnmCE,kBAEE,kBAAA,CADA,YAAA,CAEA,ejBqmCJ,CiBjmCE,mBAKE,kBAAA,CAEA,cAAA,CAHA,YAAA,CAIA,uCAAA,CALA,aAAA,CAFA,iBAAA,CAQA,uBAAA,CAHA,qBAAA,CAJA,SjB0mCJ,CiBhmCI,yBACE,UjBkmCN,CiB9lCI,iCACE,oBjBgmCN,CiB5lCI,uCAEE,uCAAA,CADA,YjB+lCN,CiB1lCI,2BAEE,YAAA,CADA,ajB6lCN,CK/+BI,0CY/GA,2BAMI,YjB4lCN,CACF,CiBzlCM,8DAIE,iBAAA,CAHA,aAAA,CAEA,aAAA,CADA,UjB6lCR,CK7gCI,mCYzEA,iCAII,YjBslCN,CACF,CiBnlCM,wCACE,YjBqlCR,CiBjlCM,+CACE,oBjBmlCR,CKxhCI,sCYtDA,iCAII,YjB8kCN,CACF,CiBzkCE,kBAEE,YAAA,CACA,cAAA,CAFA,iBAAA,CAIA,8DACE,CAFF,kBjB4kCJ,CiBtkCI,oCAGE,SAAA,CADA,mBAAA,CAKA,6BAAA,CAHA,8DACE,CAJF,UjB4kCN,CiBnkCM,8CACE,8BjBqkCR,CiBhkCI,8BACE,ejBkkCN,CiB7jCE,4BAGE,gBAAA,CAAA,kBjBikCJ,CiBpkCE,4BAGE,iBAAA,CAAA,iBjBikCJ,CiBpkCE,kBACE,WAAA,CAGA,eAAA,CAFA,aAAA,CAGA,kBjB+jCJ,CiB5jCI,4CAGE,SAAA,CADA,mBAAA,CAKA,8BAAA,CAHA,8DACE,CAJF,UjBkkCN,CiBzjCM,sDACE,6BjB2jCR,CiBvjCM,8DAGE,SAAA,CADA,mBAAA,CAKA,uBAAA,CAHA,8DACE,CAJF,SjB6jCR,CiBljCI,uCAGE,WAAA,CAFA,iBAAA,CACA,UjBqjCN,CiB/iCE,mBACE,YAAA,CACA,aAAA,CACA,cAAA,CAEA,+CACE,CAFF,kBjBkjCJ,CiB5iCI,8DACE,WAAA,CACA,SAAA,CACA,oCjB8iCN,CiBriCI,yBACE,QjBuiCN,CiBliCE,mBACE,YjBoiCJ,CKhmCI,mCY2DF,6BAQI,gBjBoiCJ,CiB5iCA,6BAQI,iBjBoiCJ,CiB5iCA,mBAKI,aAAA,CAEA,iBAAA,CADA,ajBsiCJ,CACF,CKxmCI,sCY2DF,6BAaI,kBjBoiCJ,CiBjjCA,6BAaI,mBjBoiCJ,CACF,CDnxCA,SAGE,uCAAA,CAFA,eAAA,CACA,eCuxCF,CDnxCE,eACE,mBAAA,CACA,cAAA,CAGA,eAAA,CADA,QAAA,CADA,SCuxCJ,CDjxCE,sCAEE,WAAA,CADA,iBAAA,CAAA,kBCoxCJ,CD/wCE,eACE,+BCixCJ,CD9wCI,0CACE,+BCgxCN,CD1wCA,UAKE,wBmBa
a,CnBZb,oBAAA,CAFA,UAAA,CAHA,oBAAA,CAEA,eAAA,CADA,0BAAA,CAAA,2BCixCF,CmBnzCA,MACE,0MAAA,CACA,gMAAA,CACA,yNnBszCF,CmBhzCA,QACE,eAAA,CACA,enBmzCF,CmBhzCE,eAKE,uCAAA,CAJA,aAAA,CAGA,eAAA,CADA,eAAA,CADA,eAAA,CAIA,sBnBkzCJ,CmB/yCI,+BACE,YnBizCN,CmB9yCM,mCAEE,WAAA,CADA,UnBizCR,CmBzyCQ,sFAME,iBAAA,CALA,aAAA,CAGA,aAAA,CADA,cAAA,CAEA,kBAAA,CAHA,UnB+yCV,CmBpyCE,cAGE,eAAA,CADA,QAAA,CADA,SnBwyCJ,CmBlyCE,cAGE,sBAAA,CAFA,YAAA,CACA,SAAA,CAEA,iBAAA,CAEA,uBAAA,CADA,sBnBqyCJ,CmBjyCI,sBACE,uCnBmyCN,CmB5xCM,6EAEE,+BnB8xCR,CmBzxCI,2BAIE,iBnBwxCN,CmBpxCI,4CACE,gBnBsxCN,CmBvxCI,4CACE,iBnBsxCN,CmBlxCI,kBAGE,iBAAA,CAFA,aAAA,CACA,YnBqxCN,CmBhxCI,sGACE,+BAAA,CACA,cnBkxCN,CmB9wCI,4BACE,uCAAA,CACA,oBnBgxCN,CmB5wCI,0CACE,YnB8wCN,CmB3wCM,yDAKE,6BAAA,CAJA,aAAA,CAEA,WAAA,CACA,qCAAA,CAAA,6BAAA,CAFA,UnBgxCR,CmBzwCM,kDACE,YnB2wCR,CmBrwCE,iCACE,YnBuwCJ,CmBpwCI,6CACE,WAAA,CAGA,WnBowCN,CmB/vCE,cACE,anBiwCJ,CmB7vCE,gBACE,YnB+vCJ,CK7tCI,0Cc3BA,0CASE,2CAAA,CAHA,YAAA,CACA,qBAAA,CACA,WAAA,CALA,MAAA,CADA,iBAAA,CACA,OAAA,CACA,KAAA,CACA,SnB8vCJ,CmBnvCI,+DACE,eAAA,CACA,enBqvCN,CmBjvCI,gCAQE,qDAAA,CAHA,uCAAA,CAEA,cAAA,CALA,aAAA,CAEA,kBAAA,CADA,wBAAA,CAFA,iBAAA,CAKA,kBnBqvCN,CmBhvCM,wDAGE,UnBsvCR,CmBzvCM,wDAGE,WnBsvCR,CmBzvCM,8CAIE,aAAA,CAEA,aAAA,CACA,YAAA,CANA,iBAAA,CACA,SAAA,CAGA,YnBovCR,CmB/uCQ,oDAKE,6BAAA,CADA,UAAA,CAHA,aAAA,CAEA,WAAA,CAGA,2CAAA,CAAA,mCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAPA,UnBwvCV,CmB5uCM,8CAGE,2CAAA,CACA,gEACE,CAJF,eAAA,CAKA,4BAAA,CAJA,kBnBivCR,CmB1uCQ,2DACE,YnB4uCV,CmBvuCM,8CAGE,2CAAA,CADA,gCAAA,CADA,enB2uCR,CmBruCM,yCAIE,aAAA,CAFA,UAAA,CAIA,YAAA,CADA,aAAA,CAJA,iBAAA,CACA,WAAA,CACA,SnB0uCR,CmBluCI,+BACE,MnBouCN,CmBhuCI,+BACE,4DnBkuCN,CmB/tCM,qDACE,+BnBiuCR,CmB9tCQ,sHACE,+BnBguCV,CmB1tCI,+BAEE,YAAA,CADA,mBnB6tCN,CmBztCM,mCACE,enB2tCR,CmBvtCM,6CACE,SnBytCR,CmBrtCM,uDAGE,mBnBwtCR,CmB3tCM,uDAGE,kBnBwtCR,CmB3tCM,6CAIE,gBAAA,CAFA,aAAA,CADA,YnB0tCR,CmBptCQ,mDAKE,6BAAA,CADA,UAAA,CAHA,aAAA,CAEA,WAAA,CAGA,2CAAA,CAAA,mCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,C
ACA,yBAAA,CAAA,iBAAA,CAPA,UnB6tCV,CmB7sCM,+CACE,mBnB+sCR,CmBvsCM,4CAEE,wBAAA,CADA,enB0sCR,CmBtsCQ,oEACE,mBnBwsCV,CmBzsCQ,oEACE,oBnBwsCV,CmBpsCQ,4EACE,iBnBssCV,CmBvsCQ,4EACE,kBnBssCV,CmBlsCQ,oFACE,mBnBosCV,CmBrsCQ,oFACE,oBnBosCV,CmBhsCQ,4FACE,mBnBksCV,CmBnsCQ,4FACE,oBnBksCV,CmB3rCE,mBACE,wBnB6rCJ,CmBzrCE,wBACE,YAAA,CACA,SAAA,CAIA,0BAAA,CAHA,oEnB4rCJ,CmBtrCI,kCACE,2BnBwrCN,CmBnrCE,gCACE,SAAA,CAIA,uBAAA,CAHA,qEnBsrCJ,CmBhrCI,8CAEE,kCAAA,CAAA,0BnBirCN,CACF,CKh3CI,0CcuMA,0CACE,YnB4qCJ,CmBzqCI,yDACE,UnB2qCN,CmBvqCI,wDACE,YnByqCN,CmBrqCI,kDACE,YnBuqCN,CmBlqCE,gBAIE,iDAAA,CADA,gCAAA,CAFA,aAAA,CACA,enBsqCJ,CACF,CK76CM,+DcgRF,6CACE,YnBgqCJ,CmB7pCI,4DACE,UnB+pCN,CmB3pCI,2DACE,YnB6pCN,CmBzpCI,qDACE,YnB2pCN,CACF,CKr6CI,mCc7JJ,QA6aI,oBnBypCF,CmBnpCI,kCAME,qCAAA,CACA,qDAAA,CANA,eAAA,CACA,KAAA,CAGA,SnBqpCN,CmBhpCM,6CACE,uBnBkpCR,CmB9oCM,gDACE,YnBgpCR,CmB3oCI,2CACE,kBnB8oCN,CmB/oCI,2CACE,mBnB8oCN,CmB/oCI,iCAEE,oBnB6oCN,CmBtoCI,yDACE,kBnBwoCN,CmBzoCI,yDACE,iBnBwoCN,CACF,CK97CI,sCc7JJ,QAydI,oBAAA,CACA,oDnBsoCF,CmBhoCI,gCAME,qCAAA,CACA,qDAAA,CANA,eAAA,CACA,KAAA,CAGA,SnBkoCN,CmB7nCM,8CACE,uBnB+nCR,CmB3nCM,8CACE,YnB6nCR,CmBxnCI,yCACE,kBnB2nCN,CmB5nCI,yCACE,mBnB2nCN,CmB5nCI,+BAEE,oBnB0nCN,CmBnnCI,uDACE,kBnBqnCN,CmBtnCI,uDACE,iBnBqnCN,CmBhnCE,wBACE,YAAA,CACA,sBAAA,CAEA,SAAA,CACA,6FACE,CAHF,mBnBonCJ,CmB5mCI,sCACE,enB8mCN,CmBzmCE,iFACE,sBAAA,CAEA,SAAA,CACA,4FACE,CAHF,kBnB6mCJ,CmBpmCE,iDACE,enBsmCJ,CmBlmCE,6CACE,YnBomCJ,CmBhmCE,uBACE,aAAA,CACA,enBkmCJ,CmB/lCI,kCACE,enBimCN,CmB7lCI,qCACE,enB+lCN,CmB5lCM,0CACE,uCnB8lCR,CmB1lCM,6DACE,mBnB4lCR,CmBxlCM,yFAEE,YnB0lCR,CmBrlCI,yCAEE,kBnBylCN,CmB3lCI,yCAEE,mBnBylCN,CmB3lCI,+BACE,aAAA,CAGA,SAAA,CADA,kBnBwlCN,CmBplCM,2DACE,SnBslCR,CmBhlCE,cAGE,kBAAA,CADA,YAAA,CAEA,gCAAA,CAHA,WnBqlCJ,CmB/kCI,oBACE,uDnBilCN,CmB7kCI,oBAME,6BAAA,CACA,kBAAA,CAFA,UAAA,CAJA,oBAAA,CAEA,WAAA,CAMA,2CAAA,CAAA,mCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAJA,yBAAA,CAJA,qBAAA,CAFA,UnBylCN,CmB5kCM,8BACE,wBnB8kCR,CmB1kCM,kKAEE,uBnB2kCR,CmB7jCI,2EACE,YnBkkCN
,CmB/jCM,oDACE,anBikCR,CmB9jCQ,kEAKE,qCAAA,CACA,qDAAA,CAFA,YAAA,CAHA,eAAA,CACA,KAAA,CACA,SnBmkCV,CmB7jCU,0FACE,mBnB+jCZ,CmB1jCQ,0EACE,QnB4jCV,CmBvjCM,sFACE,kBnByjCR,CmB1jCM,sFACE,mBnByjCR,CmBrjCM,kDACE,uCnBujCR,CmBjjCI,2CACE,sBAAA,CAEA,SAAA,CADA,kBnBojCN,CmB3iCI,qFAIE,mDnB8iCN,CmBljCI,qFAIE,oDnB8iCN,CmBljCI,2EACE,aAAA,CACA,oBAAA,CAGA,SAAA,CAFA,kBnB+iCN,CmB1iCM,yFAEE,gBAAA,CADA,gBnB6iCR,CmBxiCM,0FACE,YnB0iCR,CACF,CoB9vDA,eAKE,eAAA,CACA,eAAA,CAJA,SpBqwDF,CoB9vDE,gCANA,kBAAA,CAFA,YAAA,CAGA,sBpB4wDF,CoBvwDE,iBAOE,mBAAA,CAFA,aAAA,CADA,gBAAA,CAEA,iBpBiwDJ,CoB5vDE,wBAEE,qDAAA,CADA,uCpB+vDJ,CoB1vDE,qBACE,6CpB4vDJ,CoBvvDI,sDAEE,uDAAA,CADA,+BpB0vDN,CoBtvDM,8DACE,+BpBwvDR,CoBnvDI,mCACE,uCAAA,CACA,oBpBqvDN,CoBjvDI,yBAKE,iBAAA,CADA,yCAAA,CAHA,aAAA,CAEA,eAAA,CADA,YpBsvDN,CqBtyDE,eAGE,+DAAA,CADA,oBAAA,CADA,qBrB2yDJ,CKtnDI,0CgBtLF,eAOI,YrByyDJ,CACF,CqBnyDM,6BACE,oBrBqyDR,CqB/xDE,kBACE,YAAA,CACA,qBAAA,CACA,SAAA,CACA,qBrBiyDJ,CqB1xDI,0BACE,sBrB4xDN,CqBzxDM,gEACE,+BrB2xDR,CqBrxDE,gBAEE,uCAAA,CADA,erBwxDJ,CqBnxDE,kBACE,oBrBqxDJ,CqBlxDI,mCAGE,kBAAA,CAFA,YAAA,CACA,SAAA,CAEA,iBrBoxDN,CqBhxDI,oCAIE,kBAAA,CAHA,mBAAA,CACA,kBAAA,CACA,SAAA,CAGA,QAAA,CADA,iBrBmxDN,CqB9wDI,0DACE,kBrBgxDN,CqBjxDI,0DACE,iBrBgxDN,CqB5wDI,iDACE,uBAAA,CAEA,YrB6wDN,CqBxwDE,4BACE,YrB0wDJ,CqBnwDA,YAGE,kBAAA,CAFA,YAAA,CAIA,eAAA,CAHA,SAAA,CAIA,eAAA,CAFA,UrBwwDF,CqBnwDE,yBACE,WrBqwDJ,CqB9vDA,kBACE,YrBiwDF,CKzrDI,0CgBzEJ,kBAKI,wBrBiwDF,CACF,CqB9vDE,qCACE,WrBgwDJ,CKptDI,sCgB7CF,+CAKI,kBrBgwDJ,CqBrwDA,+CAKI,mBrBgwDJ,CACF,CKtsDI,0CgBrDJ,6BAMI,SAAA,CAFA,eAAA,CACA,UrB6vDF,CqB1vDE,qDACE,gBrB4vDJ,CqBzvDE,gDACE,SrB2vDJ,CqBxvDE,4CACE,iBAAA,CAAA,kBrB0vDJ,CqBvvDE,2CAEE,WAAA,CADA,crB0vDJ,CqBtvDE,2CACE,mBAAA,CACA,cAAA,CACA,SAAA,CACA,oBAAA,CAAA,iBrBwvDJ,CqBrvDE,2CACE,SrBuvDJ,CqBpvDE,qCAEE,WAAA,CACA,eAAA,CAFA,erBwvDJ,CACF,CsBl6DA,MACE,qBAAA,CACA,yBtBq6DF,CsB/5DA,aAME,qCAAA,CADA,cAAA,CAEA,0FACE,CAPF,cAAA,CACA,KAAA,CAaA,mDAAA,CACA,qBAAA,CAJA,wFACE,CATF,UAAA,CADA,StBy6DF,CuBp7DA,MACE,igBvBu7DF,CuBj7DA,WACE,iBvBo7DF,CKtxDI,mCkB/JJ,WAKI,evBo7D
F,CACF,CuBj7DE,kBACE,YvBm7DJ,CuB/6DE,oBAEE,SAAA,CADA,SvBk7DJ,CK/wDI,0CkBpKF,8BAkBI,YvB+6DJ,CuBj8DA,8BAkBI,avB+6DJ,CuBj8DA,oBAYI,2CAAA,CACA,kBAAA,CAJA,WAAA,CACA,eAAA,CACA,mBAAA,CALA,iBAAA,CACA,SAAA,CAUA,uBAAA,CAHA,4CACE,CAPF,UvBy7DJ,CuB56DI,+DACE,SAAA,CACA,oCvB86DN,CACF,CKrzDI,mCkBjJF,8BAyCI,MvBw6DJ,CuBj9DA,8BAyCI,OvBw6DJ,CuBj9DA,oBAoCI,0BAAA,CADA,cAAA,CADA,QAAA,CAHA,cAAA,CACA,KAAA,CAKA,sDACE,CALF,OvBg7DJ,CuBr6DI,+DAME,YAAA,CACA,SAAA,CACA,4CACE,CARF,UvB06DN,CACF,CKpzDI,0CkBxGA,+DAII,mBvB45DN,CACF,CKl2DM,+DkB/DF,+DASI,mBvB45DN,CACF,CKv2DM,+DkB/DF,+DAcI,mBvB45DN,CACF,CuBv5DE,kBAEE,kCAAA,CAAA,0BvBw5DJ,CKt0DI,0CkBpFF,4BAmBI,MvBo5DJ,CuBv6DA,4BAmBI,OvBo5DJ,CuBv6DA,kBAUI,QAAA,CAEA,SAAA,CADA,eAAA,CALA,cAAA,CACA,KAAA,CAWA,wBAAA,CALA,qGACE,CALF,OAAA,CADA,SvB+5DJ,CuBj5DI,4BACE,yBvBm5DN,CuB/4DI,6DAEE,WAAA,CACA,SAAA,CAMA,uBAAA,CALA,sGACE,CAJF,UvBq5DN,CACF,CKj3DI,mCkBjEF,4BA2CI,WvB+4DJ,CuB17DA,4BA2CI,UvB+4DJ,CuB17DA,kBA6CI,eAAA,CAHA,iBAAA,CAIA,8CAAA,CAFA,avB84DJ,CACF,CKh5DM,+DkBOF,6DAII,avBy4DN,CACF,CK/3DI,sCkBfA,6DASI,avBy4DN,CACF,CuBp4DE,iBAIE,2CAAA,CACA,0BAAA,CAFA,aAAA,CAFA,iBAAA,CAKA,2CACE,CALF,SvB04DJ,CK54DI,mCkBAF,iBAaI,0BAAA,CACA,mBAAA,CAFA,avBs4DJ,CuBj4DI,uBACE,0BvBm4DN,CACF,CuB/3DI,4DAEE,2CAAA,CACA,6BAAA,CACA,8BAAA,CAHA,gCvBo4DN,CuB53DE,4BAKE,mBAAA,CAAA,oBvBi4DJ,CuBt4DE,4BAKE,mBAAA,CAAA,oBvBi4DJ,CuBt4DE,kBAQE,gBAAA,CAFA,eAAA,CAFA,WAAA,CAHA,iBAAA,CAMA,sBAAA,CAJA,UAAA,CADA,SvBo4DJ,CuB33DI,+BACE,qBvB63DN,CuBz3DI,kEAEE,uCvB03DN,CuBt3DI,6BACE,YvBw3DN,CK55DI,0CkBaF,kBA8BI,eAAA,CADA,aAAA,CADA,UvBy3DJ,CACF,CKt7DI,mCkBgCF,4BAmCI,mBvBy3DJ,CuB55DA,4BAmCI,oBvBy3DJ,CuB55DA,kBAqCI,aAAA,CADA,evBw3DJ,CuBp3DI,+BACE,uCvBs3DN,CuBl3DI,mCACE,gCvBo3DN,CuBh3DI,6DACE,kBvBk3DN,CuB/2DM,8EACE,uCvBi3DR,CuB72DM,0EACE,WvB+2DR,CACF,CuBz2DE,iBAIE,cAAA,CAHA,oBAAA,CAEA,aAAA,CAEA,kCACE,CAJF,YvB82DJ,CuBt2DI,uBACE,UvBw2DN,CuBp2DI,yCAGE,UvBu2DN,CuB12DI,yCAGE,WvBu2DN,CuB12DI,+BACE,iBAAA,CACA,SAAA,CAEA,SvBs2DN,CuBn2DM,6CACE,oBvBq2DR,CK58DI,0CkB+FA,yCAcI,UvBo2DN,CuBl3DE,yCAcI,WvBo2DN,CuBl3DE,+BAaI,SvBq2DN,CuBj2D
M,+CACE,YvBm2DR,CACF,CKx+DI,mCkBkHA,+BAwBI,mBvBk2DN,CuB/1DM,8CACE,YvBi2DR,CACF,CuB31DE,8BAGE,WvB+1DJ,CuBl2DE,8BAGE,UvB+1DJ,CuBl2DE,oBAKE,mBAAA,CAJA,iBAAA,CACA,SAAA,CAEA,SvB81DJ,CKp+DI,0CkBkIF,8BAUI,WvB61DJ,CuBv2DA,8BAUI,UvB61DJ,CuBv2DA,oBASI,SvB81DJ,CACF,CuB11DI,uCACE,iBvBg2DN,CuBj2DI,uCACE,kBvBg2DN,CuBj2DI,6BAEE,uCAAA,CACA,SAAA,CAIA,oBAAA,CAHA,+DvB61DN,CuBv1DM,iDAEE,uCAAA,CADA,YvB01DR,CuBr1DM,gGAGE,SAAA,CADA,mBAAA,CAEA,kBvBs1DR,CuBn1DQ,sGACE,UvBq1DV,CuB90DE,8BAOE,mBAAA,CAAA,oBvBq1DJ,CuB51DE,8BAOE,mBAAA,CAAA,oBvBq1DJ,CuB51DE,oBAIE,kBAAA,CAKA,yCAAA,CANA,YAAA,CAKA,eAAA,CAFA,WAAA,CAKA,SAAA,CAVA,iBAAA,CACA,KAAA,CAUA,uBAAA,CAFA,kBAAA,CALA,UvBu1DJ,CK9hEI,mCkBkMF,8BAgBI,mBvBi1DJ,CuBj2DA,8BAgBI,oBvBi1DJ,CuBj2DA,oBAiBI,evBg1DJ,CACF,CuB70DI,+DACE,SAAA,CACA,0BvB+0DN,CuB10DE,6BAKE,+BvB60DJ,CuBl1DE,0DAME,gCvB40DJ,CuBl1DE,6BAME,+BvB40DJ,CuBl1DE,mBAIE,eAAA,CAHA,iBAAA,CAEA,UAAA,CADA,SvBg1DJ,CK7hEI,0CkB2MF,mBAWI,QAAA,CADA,UvB60DJ,CACF,CKtjEI,mCkB8NF,mBAiBI,SAAA,CADA,UAAA,CAEA,sBvB40DJ,CuBz0DI,8DACE,8BAAA,CACA,SvB20DN,CACF,CuBt0DE,uBASE,kCAAA,CAAA,0BAAA,CAFA,2CAAA,CANA,WAAA,CACA,eAAA,CAIA,kBvBu0DJ,CuBj0DI,iEAZF,uBAaI,uBvBo0DJ,CACF,CKnmEM,+DkBiRJ,uBAkBI,avBo0DJ,CACF,CKllEI,sCkB2PF,uBAuBI,avBo0DJ,CACF,CKvlEI,mCkB2PF,uBA4BI,YAAA,CAEA,yDAAA,CADA,oBvBq0DJ,CuBj0DI,kEACE,evBm0DN,CuB/zDI,6BACE,+CvBi0DN,CuB7zDI,0CAEE,YAAA,CADA,WvBg0DN,CuB3zDI,gDACE,oDvB6zDN,CuB1zDM,sDACE,0CvB4zDR,CACF,CuBrzDA,kBACE,gCAAA,CACA,qBvBwzDF,CuBrzDE,wBAKE,qDAAA,CADA,uCAAA,CAFA,gBAAA,CACA,kBAAA,CAFA,eAAA,CAKA,uBvBuzDJ,CK3nEI,mCkB8TF,kCAUI,mBvBuzDJ,CuBj0DA,kCAUI,oBvBuzDJ,CACF,CuBnzDE,wBAGE,eAAA,CADA,QAAA,CADA,SAAA,CAIA,wBAAA,CAAA,gBvBozDJ,CuBhzDE,wBACE,yDvBkzDJ,CuB/yDI,oCACE,evBizDN,CuB5yDE,wBACE,aAAA,CACA,YAAA,CAEA,uBAAA,CADA,gCvB+yDJ,CuB3yDI,4DACE,uDvB6yDN,CuBzyDI,gDACE,mBvB2yDN,CuBtyDE,gCAKE,cAAA,CADA,aAAA,CAEA,YAAA,CALA,eAAA,CAMA,uBAAA,CALA,KAAA,CACA,SvB4yDJ,CuBryDI,wCACE,YvBuyDN,CuBlyDI,wDACE,YvBoyDN,CuBhyDI,oCAGE,+BAAA,CADA,gBAAA,CADA,mBAAA,CAGA,2CvBkyDN,CK7qEI,mCkBuYA,8CAUI,mBvBgyDN,CuB1yDE,8CAUI,oBvBgyDN,CACF,
CuB5xDI,oFAEE,uDAAA,CADA,+BvB+xDN,CuBzxDE,sCACE,2CvB2xDJ,CuBtxDE,2BAGE,eAAA,CADA,eAAA,CADA,iBvB0xDJ,CK9rEI,mCkBmaF,qCAOI,mBvBwxDJ,CuB/xDA,qCAOI,oBvBwxDJ,CACF,CuBpxDE,kCAEE,MvB0xDJ,CuB5xDE,kCAEE,OvB0xDJ,CuB5xDE,wBAME,uCAAA,CAFA,aAAA,CACA,YAAA,CAJA,iBAAA,CAEA,YvByxDJ,CKxrEI,0CkB4ZF,wBAUI,YvBsxDJ,CACF,CuBnxDI,8BAKE,6BAAA,CADA,UAAA,CAHA,oBAAA,CAEA,WAAA,CAGA,+CAAA,CAAA,uCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAPA,UvB4xDN,CuBlxDM,wCACE,oBvBoxDR,CuB9wDE,8BAGE,uCAAA,CAFA,gBAAA,CACA,evBixDJ,CuB7wDI,iCAKE,gCAAA,CAHA,eAAA,CACA,eAAA,CACA,eAAA,CAHA,evBmxDN,CuB5wDM,sCACE,oBvB8wDR,CuBzwDI,iCAKE,gCAAA,CAHA,gBAAA,CACA,eAAA,CACA,eAAA,CAHA,avB+wDN,CuBxwDM,sCACE,oBvB0wDR,CuBpwDE,yBAKE,gCAAA,CAJA,aAAA,CAEA,gBAAA,CACA,iBAAA,CAFA,avBywDJ,CuBlwDE,uBAGE,wBAAA,CAFA,+BAAA,CACA,yBvBqwDJ,CwBz6EA,WACE,iBAAA,CACA,SxB46EF,CwBz6EE,kBAOE,2CAAA,CACA,mBAAA,CACA,8BAAA,CAHA,gCAAA,CAHA,QAAA,CAEA,gBAAA,CADA,YAAA,CAMA,SAAA,CATA,iBAAA,CACA,sBAAA,CAaA,mCAAA,CAJA,oExB46EJ,CwBr6EI,6EACE,gBAAA,CACA,SAAA,CAKA,+BAAA,CAJA,8ExBw6EN,CwBh6EI,wBAWE,+BAAA,CAAA,8CAAA,CAFA,6BAAA,CAAA,8BAAA,CACA,YAAA,CAFA,UAAA,CAHA,QAAA,CAFA,QAAA,CAIA,kBAAA,CADA,iBAAA,CALA,iBAAA,CACA,KAAA,CAEA,OxBy6EN,CwB75EE,iBAOE,mBAAA,CAFA,eAAA,CACA,oBAAA,CAHA,QAAA,CAFA,kBAAA,CAGA,aAAA,CAFA,SxBo6EJ,CwB35EE,iBACE,kBxB65EJ,CwBz5EE,2BAGE,kBAAA,CAAA,oBxB+5EJ,CwBl6EE,2BAGE,mBAAA,CAAA,mBxB+5EJ,CwBl6EE,iBAIE,cAAA,CAHA,aAAA,CAIA,YAAA,CAIA,uBAAA,CAHA,2CACE,CALF,UxBg6EJ,CwBt5EI,8CACE,+BxBw5EN,CwBp5EI,uBACE,qDxBs5EN,CyB1+EA,YAIE,qBAAA,CADA,aAAA,CAGA,gBAAA,CALA,eAAA,CACA,UAAA,CAGA,azB8+EF,CyB1+EE,aATF,YAUI,YzB6+EF,CACF,CK/zEI,0CoB3KF,+BAeI,azBw+EJ,CyBv/EA,+BAeI,czBw+EJ,CyBv/EA,qBAUI,2CAAA,CAHA,aAAA,CAEA,WAAA,CALA,cAAA,CACA,KAAA,CASA,uBAAA,CAHA,iEACE,CAJF,aAAA,CAFA,SzBi/EJ,CyBr+EI,mEACE,8BAAA,CACA,6BzBu+EN,CyBp+EM,6EACE,8BzBs+ER,CyBj+EI,6CAEE,QAAA,CAAA,MAAA,CACA,QAAA,CAEA,eAAA,CAJA,iBAAA,CACA,OAAA,CAEA,qBAAA,CAFA,KzBs+EN,CACF,CK92EI,sCoBtKJ,YAuDI,QzBi+EF,CyB99EE,mBACE,WzBg+EJ,CyB59EE,6CACE,UzB89EJ,CACF,CyB19EE,uBACE,YAAA,CACA,OzB49EJ
,CK73EI,mCoBjGF,uBAMI,QzB49EJ,CyBz9EI,8BACE,WzB29EN,CyBv9EI,qCACE,azBy9EN,CyBr9EI,+CACE,kBzBu9EN,CACF,CyBl9EE,wBAUE,uBAAA,CANA,kCAAA,CAAA,0BAAA,CAHA,cAAA,CACA,eAAA,CASA,yDAAA,CAFA,oBzBi9EJ,CyB58EI,2CAEE,YAAA,CADA,WzB+8EN,CyB18EI,mEACE,+CzB48EN,CyBz8EM,qHACE,oDzB28ER,CyBx8EQ,iIACE,0CzB08EV,CyB37EE,wCAGE,wBACE,qBzB27EJ,CyBv7EE,6BACE,kCzBy7EJ,CyB17EE,6BACE,iCzBy7EJ,CACF,CKr5EI,0CoB5BF,YAME,0BAAA,CADA,QAAA,CAEA,SAAA,CANA,cAAA,CACA,KAAA,CAMA,sDACE,CALF,OAAA,CADA,SzB07EF,CyB/6EE,4CAEE,WAAA,CACA,SAAA,CACA,4CACE,CAJF,UzBo7EJ,CACF,C0BjmFA,iBACE,GACE,Q1BmmFF,C0BhmFA,GACE,a1BkmFF,CACF,C0B9lFA,gBACE,GACE,SAAA,CACA,0B1BgmFF,C0B7lFA,IACE,S1B+lFF,C0B5lFA,GACE,SAAA,CACA,uB1B8lFF,CACF,C0BtlFA,MACE,+eAAA,CACA,ygBAAA,CACA,mmBAAA,CACA,sf1BwlFF,C0BllFA,WAOE,kCAAA,CAAA,0BAAA,CANA,aAAA,CACA,gBAAA,CACA,eAAA,CAEA,uCAAA,CAGA,uBAAA,CAJA,kB1BwlFF,C0BjlFE,iBACE,U1BmlFJ,C0B/kFE,iBACE,oBAAA,CAEA,aAAA,CACA,qBAAA,CAFA,U1BmlFJ,C0B9kFI,+BACE,iB1BilFN,C0BllFI,+BACE,kB1BilFN,C0BllFI,qBAEE,gB1BglFN,C0B5kFI,kDACE,iB1B+kFN,C0BhlFI,kDACE,kB1B+kFN,C0BhlFI,kDAEE,iB1B8kFN,C0BhlFI,kDAEE,kB1B8kFN,C0BzkFE,iCAGE,iB1B8kFJ,C0BjlFE,iCAGE,kB1B8kFJ,C0BjlFE,uBACE,oBAAA,CACA,6BAAA,CAEA,eAAA,CACA,sBAAA,CACA,qB1B2kFJ,C0BvkFE,kBACE,YAAA,CAMA,gBAAA,CALA,SAAA,CAMA,oBAAA,CAHA,gBAAA,CAIA,WAAA,CAHA,eAAA,CAFA,SAAA,CADA,U1B+kFJ,C0BtkFI,iDACE,4B1BwkFN,C0BnkFE,iBACE,eAAA,CACA,sB1BqkFJ,C0BlkFI,gDACE,2B1BokFN,C0BhkFI,kCAIE,kB1BwkFN,C0B5kFI,kCAIE,iB1BwkFN,C0B5kFI,wBAOE,6BAAA,CADA,UAAA,CALA,oBAAA,CAEA,YAAA,CAKA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CALA,uBAAA,CAHA,W1B0kFN,C0B9jFI,iCACE,a1BgkFN,C0B5jFI,iCACE,gDAAA,CAAA,wC1B8jFN,C0B1jFI,+BACE,8CAAA,CAAA,sC1B4jFN,C0BxjFI,+BACE,8CAAA,CAAA,sC1B0jFN,C0BtjFI,sCACE,qDAAA,CAAA,6C1BwjFN,C0BljFA,gBACE,Y1BqjFF,C0BljFE,gCAIE,kB1BsjFJ,C0B1jFE,gCAIE,iB1BsjFJ,C0B1jFE,sBAGE,kBAAA,CAGA,uCAAA,CALA,mBAAA,CAIA,gBAAA,CAHA,S1BwjFJ,C0BjjFI,+BACE,aAAA,CACA,oB1BmjFN,C0B/iFI,2CACE,U1BkjFN,C0BnjFI,2CACE,W1BkjFN,C0BnjFI,iCAEE,kB1BijFN,C0B7iFI,0BACE,W1B+iFN,C2BtuFA,MACE,mSAAA,CACA,oVAAA,CACA,
mOAAA,CACA,qZ3ByuFF,C2BhuFE,iBAME,kDAAA,CADA,UAAA,CAJA,oBAAA,CAEA,cAAA,CAIA,mCAAA,CAAA,2BAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CANA,0BAAA,CAFA,a3B2uFJ,C2B/tFE,uBACE,6B3BiuFJ,C2B7tFE,sBACE,wCAAA,CAAA,gC3B+tFJ,C2B3tFE,6BACE,+CAAA,CAAA,uC3B6tFJ,C2BztFE,4BACE,8CAAA,CAAA,sC3B2tFJ,C4BtwFA,SASE,2CAAA,CADA,gCAAA,CAJA,aAAA,CAGA,eAAA,CADA,aAAA,CADA,UAAA,CAFA,S5B6wFF,C4BpwFE,aAZF,SAaI,Y5BuwFF,CACF,CK5lFI,0CuBzLJ,SAkBI,Y5BuwFF,CACF,C4BpwFE,iBACE,mB5BswFJ,C4BlwFE,yBAIE,iB5BywFJ,C4B7wFE,yBAIE,kB5BywFJ,C4B7wFE,eAQE,eAAA,CAPA,YAAA,CAMA,eAAA,CAJA,QAAA,CAEA,aAAA,CAHA,SAAA,CAWA,oBAAA,CAPA,kB5BuwFJ,C4B7vFI,kCACE,Y5B+vFN,C4B1vFE,eACE,aAAA,CACA,kBAAA,CAAA,mB5B4vFJ,C4BzvFI,sCACE,aAAA,CACA,S5B2vFN,C4BrvFE,eAOE,kCAAA,CAAA,0BAAA,CANA,YAAA,CAEA,eAAA,CADA,gBAAA,CAMA,UAAA,CAJA,uCAAA,CACA,oBAAA,CAIA,8D5BsvFJ,C4BjvFI,0CACE,aAAA,CACA,S5BmvFN,C4B/uFI,6BAEE,kB5BkvFN,C4BpvFI,6BAEE,iB5BkvFN,C4BpvFI,mBAGE,iBAAA,CAFA,Y5BmvFN,C4B5uFM,2CACE,qB5B8uFR,C4B/uFM,2CACE,qB5BivFR,C4BlvFM,2CACE,qB5BovFR,C4BrvFM,2CACE,qB5BuvFR,C4BxvFM,2CACE,oB5B0vFR,C4B3vFM,2CACE,qB5B6vFR,C4B9vFM,2CACE,qB5BgwFR,C4BjwFM,2CACE,qB5BmwFR,C4BpwFM,4CACE,qB5BswFR,C4BvwFM,4CACE,oB5BywFR,C4B1wFM,4CACE,qB5B4wFR,C4B7wFM,4CACE,qB5B+wFR,C4BhxFM,4CACE,qB5BkxFR,C4BnxFM,4CACE,qB5BqxFR,C4BtxFM,4CACE,oB5BwxFR,C4BlxFI,gCACE,SAAA,CAIA,yBAAA,CAHA,wC5BqxFN,C6Bx3FA,MACE,wS7B23FF,C6Bl3FE,mCACE,mBAAA,CACA,cAAA,CACA,QAAA,CAEA,mBAAA,CADA,kB7Bs3FJ,C6Bj3FE,oBAGE,kBAAA,CAOA,+CAAA,CACA,oBAAA,CAVA,mBAAA,CAIA,gBAAA,CACA,0BAAA,CACA,eAAA,CALA,QAAA,CAOA,qBAAA,CADA,eAAA,CAJA,wB7B03FJ,C6Bh3FI,0BAGE,uCAAA,CAFA,aAAA,CACA,YAAA,CAEA,6C7Bk3FN,C6B72FM,gEAEE,0CAAA,CADA,+B7Bg3FR,C6B12FI,yBACE,uB7B42FN,C6Bp2FI,gCAME,oDAAA,CADA,UAAA,CAJA,oBAAA,CAEA,YAAA,CAKA,qCAAA,CAAA,6BAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAJA,iCAAA,CAHA,0BAAA,CAFA,W7B+2FN,C6Bl2FI,wFACE,0C7Bo2FN,C8B96FA,iBACE,GACE,oB9Bi7FF,C8B96FA,IACE,kB9Bg7FF,C8B76FA,GACE,oB9B+6FF,CACF,C8Bv6FA,MACE,0NAAA,CACA,uP9B06FF,C8Bn6FA,YA6BE,kCAAA,CAAA,0BAAA,CAVA,2CAAA,C
ACA,mBAAA,CACA,8BAAA,CAHA,gCAAA,CADA,sCAAA,CAdA,+IACE,CAYF,8BAAA,CAMA,SAAA,CArBA,iBAAA,CACA,uBAAA,CAyBA,4BAAA,CAJA,uDACE,CATF,6BAAA,CADA,S9Bu6FF,C8Br5FE,oBAEE,SAAA,CAKA,uBAAA,CAJA,2EACE,CAHF,S9B05FJ,C8Bh5FE,oBAEE,eAAA,CACA,wBAAA,CAAA,gBAAA,CAFA,U9Bo5FJ,C8B/4FI,6CACE,qC9Bi5FN,C8B74FI,uCAEE,eAAA,CADA,mB9Bg5FN,C8B14FI,6BACE,Y9B44FN,C8Bv4FE,8CACE,sC9By4FJ,C8Br4FE,mBAEE,gBAAA,CADA,a9Bw4FJ,C8Bp4FI,2CACE,Y9Bs4FN,C8Bl4FI,0CACE,e9Bo4FN,C8B53FA,eACE,iBAAA,CACA,eAAA,CAIA,YAAA,CAHA,kBAAA,CAEA,0BAAA,CADA,kB9Bi4FF,C8B53FE,yBACE,a9B83FJ,C8B13FE,oBACE,sCAAA,CACA,iB9B43FJ,C8Bx3FE,6BACE,oBAAA,CAGA,gB9Bw3FJ,C8Bp3FE,sBAYE,mBAAA,CANA,cAAA,CAHA,oBAAA,CACA,gBAAA,CAAA,iBAAA,CAIA,YAAA,CAGA,eAAA,CAVA,iBAAA,CAMA,wBAAA,CAAA,gBAAA,CAFA,uBAAA,CAHA,S9B83FJ,C8Bh3FI,qCACE,uB9Bk3FN,C8B92FI,cArBF,sBAsBI,W9Bi3FJ,C8B92FI,wCACE,2B9Bg3FN,C8B52FI,6BAOE,qCAAA,CACA,+CAAA,CAAA,uC9Bi3FN,C8Bv2FI,yDAZE,UAAA,CADA,YAAA,CAIA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAVA,iBAAA,CACA,SAAA,CAEA,WAAA,CADA,U9Bq4FN,C8Bt3FI,4BAOE,oDAAA,CAMA,4CAAA,CAAA,oCAAA,CADA,uBAAA,CAJA,+C9B82FN,C8Bn2FM,gDACE,uB9Bq2FR,C8Bj2FM,mFACE,0C9Bm2FR,CACF,C8B91FI,0CAGE,2BAAA,CADA,uBAAA,CADA,S9Bk2FN,C8B51FI,8CACE,oB9B81FN,C8B31FM,aAJF,8CASI,8CAAA,CACA,iBAAA,CAHA,gCAAA,CADA,eAAA,CADA,cAAA,CAGA,kB9Bg2FN,C8B31FM,oDACE,mC9B61FR,CACF,C8Bj1FE,gCAEE,iBAAA,CADA,e9Bq1FJ,C8Bj1FI,mCACE,iB9Bm1FN,C8Bh1FM,oDAGE,a9B81FR,C8Bj2FM,oDAGE,c9B81FR,C8Bj2FM,0CAcE,8CAAA,CACA,iBAAA,CALA,gCAAA,CAEA,oBAAA,CACA,qBAAA,CANA,iBAAA,CACA,eAAA,CAHA,UAAA,CAIA,gBAAA,CALA,aAAA,CAEA,cAAA,CALA,iBAAA,CAUA,iBAAA,CATA,S9B+1FR,C+B9mGA,MACE,wBAAA,CACA,wB/BinGF,C+B3mGA,aA+BE,kCAAA,CAAA,0BAAA,CAjBA,gCAAA,CADA,sCAAA,CAGA,SAAA,CADA,mBAAA,CAdA,iBAAA,CAGA,wDACE,CAgBF,4BAAA,CAGA,uEACE,CARF,uDACE,CATF,UAAA,CAGA,S/B8mGF,C+BxlGE,oBAuBE,8CAAA,CAAA,+CAAA,CADA,UAAA,CADA,aAAA,CAfA,gJACE,CANF,iBAAA,CAmBA,S/B4kGJ,C+BrkGE,yBAGE,kEAAA,CAFA,gDAAA,CACA,6C/BwkGJ,C+BnkGE,4BAGE,qEAAA,CADA,8CAAA,CADA,6C/BukGJ,C+BjkGE,qBAEE,SAAA,CAKA,uBAAA,CAJA,wEACE,CAHF,S/BskGJ,C+B5jGE,oBAyBE,uBAAA,CAJA,2CAAA
,CACA,mBAAA,CACA,8BAAA,CAjBA,0FACE,CAaF,eAAA,CADA,8BAAA,CAlBA,iBAAA,CAuBA,oB/B+iGJ,C+B3iGI,uCAEE,YAAA,CADA,W/B8iGN,C+BziGI,6CACE,oD/B2iGN,C+BxiGM,mDACE,0C/B0iGR,C+BliGI,mCAwBE,eAAA,CACA,eAAA,CAxBA,oIACE,CAgBF,sCACE,CAIF,mBAAA,CAKA,wBAAA,CAAA,gBAAA,CAbA,sBAAA,CAAA,iB/B4hGN,C+B3gGI,4CACE,Y/B6gGN,C+BzgGI,2CACE,e/B2gGN,CgC9rGA,kBAME,ehC0sGF,CgChtGA,kBAME,gBhC0sGF,CgChtGA,QAUE,2CAAA,CACA,oBAAA,CAEA,8BAAA,CALA,uCAAA,CACA,cAAA,CALA,aAAA,CAGA,eAAA,CAKA,YAAA,CAPA,mBAAA,CAJA,cAAA,CACA,UAAA,CAiBA,yBAAA,CALA,mGACE,CAZF,ShC6sGF,CgC1rGE,aAtBF,QAuBI,YhC6rGF,CACF,CgC1rGE,kBACE,wBhC4rGJ,CgCxrGE,gBAEE,SAAA,CADA,mBAAA,CAGA,+BAAA,CADA,uBhC2rGJ,CgCvrGI,0BACE,8BhCyrGN,CgCprGE,4BAEE,0CAAA,CADA,+BhCurGJ,CgClrGE,YACE,oBAAA,CACA,oBhCorGJ,CiCzuGA,oBACE,GACE,mBjC4uGF,CACF,CiCpuGA,MACE,wfjCsuGF,CiChuGA,YACE,aAAA,CAEA,eAAA,CADA,ajCouGF,CiChuGE,+BAOE,kBAAA,CAAA,kBjCiuGJ,CiCxuGE,+BAOE,iBAAA,CAAA,mBjCiuGJ,CiCxuGE,qBAQE,aAAA,CACA,cAAA,CACA,YAAA,CATA,iBAAA,CAKA,UjCkuGJ,CiC3tGI,qCAIE,iBjCmuGN,CiCvuGI,qCAIE,kBjCmuGN,CiCvuGI,2BAME,6BAAA,CADA,UAAA,CAJA,oBAAA,CAEA,YAAA,CAIA,yCAAA,CAAA,iCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CARA,WjCquGN,CiCxtGE,mBACE,iBAAA,CACA,UjC0tGJ,CiCttGE,kBAUE,2CAAA,CACA,mBAAA,CACA,8BAAA,CAJA,gCAAA,CACA,oBAAA,CAHA,kBAAA,CAFA,YAAA,CASA,SAAA,CANA,aAAA,CAFA,SAAA,CAJA,iBAAA,CAgBA,4BAAA,CAfA,UAAA,CAYA,+CACE,CAZF,SjCouGJ,CiCntGI,+EACE,gBAAA,CACA,SAAA,CACA,sCjCqtGN,CiC/sGI,qCAEE,oCACE,gCjCgtGN,CiC5sGI,2CACE,cjC8sGN,CACF,CiCzsGE,kBACE,kBjC2sGJ,CiCvsGE,4BAGE,kBAAA,CAAA,oBjC8sGJ,CiCjtGE,4BAGE,mBAAA,CAAA,mBjC8sGJ,CiCjtGE,kBAKE,cAAA,CAJA,aAAA,CAKA,YAAA,CAIA,uBAAA,CAHA,2CACE,CAJF,kBAAA,CAFA,UjC+sGJ,CiCpsGI,gDACE,+BjCssGN,CiClsGI,wBACE,qDjCosGN,CkC1yGA,MAEI,uWAAA,CAAA,8WAAA,CAAA,sPAAA,CAAA,8xBAAA,CAAA,0MAAA,CAAA,gbAAA,CAAA,gMAAA,CAAA,iQAAA,CAAA,0VAAA,CAAA,6aAAA,CAAA,8SAAA,CAAA,gMlCm0GJ,CkCvzGE,4CAME,8CAAA,CACA,4BAAA,CACA,mBAAA,CACA,8BAAA,CAJA,mCAAA,CAJA,iBAAA,CAGA,gBAAA,CADA,iBAAA,CADA,eAAA,CASA,uBAAA,CADA,2BlC2zGJ,CkCvzGI,aAdF,4CAeI,elC0zGJ,CACF,CkCvzGI,sEACE,gClC
yzGN,CkCpzGI,gDACE,qBlCszGN,CkClzGI,gIAEE,iBAAA,CADA,clCqzGN,CkChzGI,4FACE,iBlCkzGN,CkC9yGI,kFACE,elCgzGN,CkC5yGI,0FACE,YlC8yGN,CkC1yGI,8EACE,mBlC4yGN,CkCvyGE,sEAGE,iBAAA,CAAA,mBlCizGJ,CkCpzGE,sEAGE,kBAAA,CAAA,kBlCizGJ,CkCpzGE,sEASE,uBlC2yGJ,CkCpzGE,sEASE,wBlC2yGJ,CkCpzGE,sEAUE,4BlC0yGJ,CkCpzGE,4IAWE,6BlCyyGJ,CkCpzGE,sEAWE,4BlCyyGJ,CkCpzGE,kDAOE,0BAAA,CACA,WAAA,CAFA,eAAA,CADA,eAAA,CAHA,oBAAA,CAAA,iBAAA,CADA,iBlCmzGJ,CkCtyGI,kFACE,elCwyGN,CkCpyGI,oFAOE,UlC0yGN,CkCjzGI,oFAOE,WlC0yGN,CkCjzGI,gEAME,wBhBkIU,CgBnIV,UAAA,CADA,WAAA,CAIA,kDAAA,CAAA,0CAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAVA,iBAAA,CACA,UAAA,CACA,UlC8yGN,CkClyGI,4DACE,4DlCoyGN,CkCtxGE,sDACE,oBlCyxGJ,CkCtxGI,gFACE,gClCwxGN,CkCnxGE,8DACE,0BlCsxGJ,CkCnxGI,4EACE,wBAlBG,CAmBH,kDAAA,CAAA,0ClCqxGN,CkCjxGI,0EACE,alCmxGN,CkCxyGE,8DACE,oBlC2yGJ,CkCxyGI,wFACE,gClC0yGN,CkCryGE,sEACE,0BlCwyGJ,CkCryGI,oFACE,wBAlBG,CAmBH,sDAAA,CAAA,8ClCuyGN,CkCnyGI,kFACE,alCqyGN,CkC1zGE,sDACE,oBlC6zGJ,CkC1zGI,gFACE,gClC4zGN,CkCvzGE,8DACE,0BlC0zGJ,CkCvzGI,4EACE,wBAlBG,CAmBH,kDAAA,CAAA,0ClCyzGN,CkCrzGI,0EACE,alCuzGN,CkC50GE,oDACE,oBlC+0GJ,CkC50GI,8EACE,gClC80GN,CkCz0GE,4DACE,0BlC40GJ,CkCz0GI,0EACE,wBAlBG,CAmBH,iDAAA,CAAA,yClC20GN,CkCv0GI,wEACE,alCy0GN,CkC91GE,4DACE,oBlCi2GJ,CkC91GI,sFACE,gClCg2GN,CkC31GE,oEACE,0BlC81GJ,CkC31GI,kFACE,wBAlBG,CAmBH,qDAAA,CAAA,6ClC61GN,CkCz1GI,gFACE,alC21GN,CkCh3GE,8DACE,oBlCm3GJ,CkCh3GI,wFACE,gClCk3GN,CkC72GE,sEACE,0BlCg3GJ,CkC72GI,oFACE,wBAlBG,CAmBH,sDAAA,CAAA,8ClC+2GN,CkC32GI,kFACE,alC62GN,CkCl4GE,4DACE,oBlCq4GJ,CkCl4GI,sFACE,gClCo4GN,CkC/3GE,oEACE,0BlCk4GJ,CkC/3GI,kFACE,wBAlBG,CAmBH,qDAAA,CAAA,6ClCi4GN,CkC73GI,gFACE,alC+3GN,CkCp5GE,4DACE,oBlCu5GJ,CkCp5GI,sFACE,gClCs5GN,CkCj5GE,oEACE,0BlCo5GJ,CkCj5GI,kFACE,wBAlBG,CAmBH,qDAAA,CAAA,6ClCm5GN,CkC/4GI,gFACE,alCi5GN,CkCt6GE,0DACE,oBlCy6GJ,CkCt6GI,oFACE,gClCw6GN,CkCn6GE,kEACE,0BlCs6GJ,CkCn6GI,gFACE,wBAlBG,CAmBH,oDAAA,CAAA,4ClCq6GN,CkCj6GI,8EACE,alCm6GN,CkCx7GE,oDACE,oBlC27GJ,CkCx7GI,8EACE,gClC07GN,CkCr7GE,4DACE,0BlCw7GJ,CkCr7GI,0EACE,
wBAlBG,CAmBH,iDAAA,CAAA,yClCu7GN,CkCn7GI,wEACE,alCq7GN,CkC18GE,4DACE,oBlC68GJ,CkC18GI,sFACE,gClC48GN,CkCv8GE,oEACE,0BlC08GJ,CkCv8GI,kFACE,wBAlBG,CAmBH,qDAAA,CAAA,6ClCy8GN,CkCr8GI,gFACE,alCu8GN,CkC59GE,wDACE,oBlC+9GJ,CkC59GI,kFACE,gClC89GN,CkCz9GE,gEACE,0BlC49GJ,CkCz9GI,8EACE,wBAlBG,CAmBH,mDAAA,CAAA,2ClC29GN,CkCv9GI,4EACE,alCy9GN,CmC7nHA,MACE,wMnCgoHF,CmCvnHE,sBAEE,uCAAA,CADA,gBnC2nHJ,CmCvnHI,mCACE,anCynHN,CmC1nHI,mCACE,cnCynHN,CmCrnHM,4BACE,sBnCunHR,CmCpnHQ,mCACE,gCnCsnHV,CmClnHQ,2DACE,SAAA,CAEA,uBAAA,CADA,enCqnHV,CmChnHQ,yGACE,SAAA,CACA,uBnCknHV,CmC9mHQ,yCACE,YnCgnHV,CmCzmHE,0BACE,eAAA,CACA,enC2mHJ,CmCxmHI,+BACE,oBnC0mHN,CmCrmHE,gDACE,YnCumHJ,CmCnmHE,8BAIE,+BAAA,CAHA,oBAAA,CAEA,WAAA,CAGA,SAAA,CAKA,4BAAA,CAJA,4DACE,CAHF,0BnCumHJ,CmC9lHI,aAdF,8BAeI,+BAAA,CACA,SAAA,CACA,uBnCimHJ,CACF,CmC9lHI,wCACE,6BnCgmHN,CmC5lHI,oCACE,+BnC8lHN,CmC1lHI,qCAKE,6BAAA,CADA,UAAA,CAHA,oBAAA,CAEA,YAAA,CAGA,2CAAA,CAAA,mCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAPA,WnCmmHN,CmCtlHQ,mDACE,oBnCwlHV,CoCtsHE,kCAEE,iBpC4sHJ,CoC9sHE,kCAEE,kBpC4sHJ,CoC9sHE,wBAGE,yCAAA,CAFA,oBAAA,CAGA,SAAA,CACA,mCpCysHJ,CoCpsHI,aAVF,wBAWI,YpCusHJ,CACF,CoCnsHE,6FAEE,SAAA,CACA,mCpCqsHJ,CoC/rHE,4FAEE,+BpCisHJ,CoC7rHE,oBACE,yBAAA,CACA,uBAAA,CAGA,yEpC6rHJ,CK9jHI,sC+BrHE,qDACE,uBpCsrHN,CACF,CoCjrHE,kEACE,yBpCmrHJ,CoC/qHE,sBACE,0BpCirHJ,CqC5uHE,2BACE,arC+uHJ,CK1jHI,0CgCtLF,2BAKI,erC+uHJ,CqC5uHI,6BACE,iBrC8uHN,CACF,CqC1uHI,6BAEE,0BAAA,CAAA,2BAAA,CADA,eAAA,CAEA,iBrC4uHN,CqCzuHM,2CACE,kBrC2uHR,CqCruHI,6CACE,QrCuuHN,CsCnwHE,uBACE,4CtCuwHJ,CsClwHE,8CAJE,kCAAA,CAAA,0BtC0wHJ,CsCtwHE,uBACE,4CtCqwHJ,CsChwHE,4BAEE,kCAAA,CAAA,0BAAA,CADA,qCtCmwHJ,CsC/vHI,mCACE,atCiwHN,CsC7vHI,kCACE,atC+vHN,CsC1vHE,0BAKE,eAAA,CAJA,aAAA,CAEA,YAAA,CACA,aAAA,CAFA,kBAAA,CAAA,mBtC+vHJ,CsCzvHI,uCACE,etC2vHN,CsCvvHI,sCACE,kBtCyvHN,CuCtyHA,MACE,8LvCyyHF,CuChyHE,oBAGE,iBAAA,CAEA,gBAAA,CADA,avCkyHJ,CuC9xHI,wCACE,uBvCgyHN,CuC5xHI,gCAEE,eAAA,CADA,gBvC+xHN,CuCxxHM,wCACE,mBvC0xHR,CuCpxHE,8BAKE,oBvCwxHJ,CuC7xHE,8BAKE,mBvCwxHJ,CuC7xHE,8BAUE
,4BvCmxHJ,CuC7xHE,4DAWE,6BvCkxHJ,CuC7xHE,8BAWE,4BvCkxHJ,CuC7xHE,oBASE,cAAA,CANA,aAAA,CACA,eAAA,CAIA,evCqxHJ,CuC/wHI,kCACE,uCAAA,CACA,oBvCixHN,CuC7wHI,wCAEE,uCAAA,CADA,YvCgxHN,CuC3wHI,oCASE,WvCixHN,CuC1xHI,oCASE,UvCixHN,CuC1xHI,0BAME,6BAAA,CADA,UAAA,CADA,WAAA,CAMA,yCAAA,CAAA,iCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAZA,iBAAA,CACA,UAAA,CAMA,sBAAA,CADA,yBAAA,CAJA,UvCuxHN,CuC1wHM,oCACE,wBvC4wHR,CuCvwHI,4BACE,YvCywHN,CuCpwHI,4CACE,YvCswHN,CwCh2HE,+DACE,sBAAA,CAEA,mBAAA,CACA,0BAAA,CACA,uBxCk2HJ,CwC/1HI,2EAGE,iBAAA,CADA,eAAA,CADA,yBxCm2HN,CwC51HE,mEACE,0BxC81HJ,CwC11HE,oBACE,qBxC41HJ,CwCx1HE,gBACE,oBxC01HJ,CwCt1HE,gBACE,qBxCw1HJ,CwCp1HE,iBACE,kBxCs1HJ,CwCl1HE,kBACE,kBxCo1HJ,CyC73HE,6BACE,sCzCg4HJ,CyC73HE,cACE,yCzC+3HJ,CyCn3HE,sIACE,oCzCq3HJ,CyC72HE,2EACE,qCzC+2HJ,CyCr2HE,wGACE,oCzCu2HJ,CyC91HE,yFACE,qCzCg2HJ,CyC31HE,6BACE,kCzC61HJ,CyCv1HE,6CACE,sCzCy1HJ,CyCl1HE,4DACE,sCzCo1HJ,CyC70HE,4DACE,qCzC+0HJ,CyCt0HE,yFACE,qCzCw0HJ,CyCh0HE,2EACE,sCzCk0HJ,CyCvzHE,wHACE,qCzCyzHJ,CyCpzHE,8BAGE,mBAAA,CADA,gBAAA,CADA,gBzCwzHJ,CyCnzHE,eACE,4CzCqzHJ,CyClzHE,eACE,4CzCozHJ,CyChzHE,gBAIE,+CAAA,CACA,kDAAA,CAJA,aAAA,CAEA,wBAAA,CADA,wBzCqzHJ,CyC9yHE,yBAOE,wCAAA,CACA,+DAAA,CACA,4BAAA,CACA,6BAAA,CARA,iBAAA,CAGA,eAAA,CACA,eAAA,CAFA,cAAA,CADA,oCAAA,CAFA,iBzCyzHJ,CyC7yHI,6BACE,YzC+yHN,CyC5yHM,kCACE,wBAAA,CACA,yBzC8yHR,CyCxyHE,iCAaE,wCAAA,CACA,+DAAA,CAJA,uCAAA,CACA,0BAAA,CALA,UAAA,CAJA,oBAAA,CAOA,2BAAA,CADA,2BAAA,CADA,2BAAA,CANA,eAAA,CAWA,wBAAA,CAAA,gBAAA,CAPA,SzCizHJ,CyC/xHE,sBACE,iBAAA,CACA,iBzCiyHJ,CyC5xHE,iCAKE,ezC0xHJ,CyCvxHI,sCACE,gBzCyxHN,CyCrxHI,gDACE,YzCuxHN,CyC7wHA,gBACE,iBzCgxHF,CyC5wHE,yCACE,aAAA,CACA,SzC8wHJ,CyCzwHE,mBACE,YzC2wHJ,CyCtwHE,oBACE,QzCwwHJ,CyCpwHE,4BACE,WAAA,CACA,SAAA,CACA,ezCswHJ,CyCnwHI,0CACE,YzCqwHN,CyC/vHE,yBAKE,wCAAA,CAEA,+BAAA,CADA,4BAAA,CAHA,eAAA,CADA,oDAAA,CAEA,wBAAA,CAAA,gBzCowHJ,CyC7vHE,2BAEE,+DAAA,CADA,2BzCgwHJ,CyC5vHI,+BACE,uCAAA,CACA,gBzC8vHN,CyCzvHE,sBACE,MAAA,CACA,WzC2vHJ,CyCtvHA,aACE,azCyvHF,CyC/uHE,4BAEE,aAAA,CADA,YzCmvHJ,CyC/uHI
,wDAEE,2BAAA,CADA,wBzCkvHN,CyC5uHE,+BAKE,2CAAA,CAEA,+BAAA,CADA,gCAAA,CADA,sBAAA,CAHA,mBAAA,CACA,gBAAA,CAFA,azCovHJ,CyC3uHI,qCAEE,UAAA,CACA,UAAA,CAFA,azC+uHN,CKt3HI,0CoCsJF,8BACE,iBzCouHF,CyC1tHE,wSAGE,ezCguHJ,CyC5tHE,sCAEE,mBAAA,CACA,eAAA,CADA,oBAAA,CADA,kBAAA,CAAA,mBzCguHJ,CACF,C0C7jII,yDAIE,+BAAA,CACA,8BAAA,CAFA,aAAA,CADA,QAAA,CADA,iB1CmkIN,C0C3jII,uBAEE,uCAAA,CADA,c1C8jIN,C0CzgIM,iHAEE,WAlDkB,CAiDlB,kB1CohIR,C0CrhIM,6HAEE,WAlDkB,CAiDlB,kB1CgiIR,C0CjiIM,6HAEE,WAlDkB,CAiDlB,kB1C4iIR,C0C7iIM,oHAEE,WAlDkB,CAiDlB,kB1CwjIR,C0CzjIM,0HAEE,WAlDkB,CAiDlB,kB1CokIR,C0CrkIM,uHAEE,WAlDkB,CAiDlB,kB1CglIR,C0CjlIM,uHAEE,WAlDkB,CAiDlB,kB1C4lIR,C0C7lIM,6HAEE,WAlDkB,CAiDlB,kB1CwmIR,C0CzmIM,yCAEE,WAlDkB,CAiDlB,kB1C4mIR,C0C7mIM,yCAEE,WAlDkB,CAiDlB,kB1CgnIR,C0CjnIM,0CAEE,WAlDkB,CAiDlB,kB1ConIR,C0CrnIM,uCAEE,WAlDkB,CAiDlB,kB1CwnIR,C0CznIM,wCAEE,WAlDkB,CAiDlB,kB1C4nIR,C0C7nIM,sCAEE,WAlDkB,CAiDlB,kB1CgoIR,C0CjoIM,wCAEE,WAlDkB,CAiDlB,kB1CooIR,C0CroIM,oCAEE,WAlDkB,CAiDlB,kB1CwoIR,C0CzoIM,2CAEE,WAlDkB,CAiDlB,kB1C4oIR,C0C7oIM,qCAEE,WAlDkB,CAiDlB,kB1CgpIR,C0CjpIM,oCAEE,WAlDkB,CAiDlB,kB1CopIR,C0CrpIM,kCAEE,WAlDkB,CAiDlB,kB1CwpIR,C0CzpIM,qCAEE,WAlDkB,CAiDlB,kB1C4pIR,C0C7pIM,mCAEE,WAlDkB,CAiDlB,kB1CgqIR,C0CjqIM,qCAEE,WAlDkB,CAiDlB,kB1CoqIR,C0CrqIM,wCAEE,WAlDkB,CAiDlB,kB1CwqIR,C0CzqIM,sCAEE,WAlDkB,CAiDlB,kB1C4qIR,C0C7qIM,2CAEE,WAlDkB,CAiDlB,kB1CgrIR,C0CrqIM,iCAEE,WAPkB,CAMlB,iB1CwqIR,C0CzqIM,uCAEE,WAPkB,CAMlB,iB1C4qIR,C0C7qIM,mCAEE,WAPkB,CAMlB,iB1CgrIR,C2ClwIA,MACE,qMAAA,CACA,mM3CqwIF,C2C5vIE,wBAKE,mBAAA,CAHA,YAAA,CACA,qBAAA,CACA,YAAA,CAHA,iB3CmwIJ,C2CzvII,8BAGE,QAAA,CACA,SAAA,CAHA,iBAAA,CACA,O3C6vIN,C2CxvIM,qCACE,0B3C0vIR,C2C7tIM,kEACE,0C3C+tIR,C2CztIE,2BAKE,uBAAA,CADA,+DAAA,CAHA,YAAA,CACA,cAAA,CACA,aAAA,CAGA,oB3C2tIJ,C2CxtII,aATF,2BAUI,gB3C2tIJ,CACF,C2CxtII,cAGE,+BACE,iB3CwtIN,C2CrtIM,sCAQE,qCAAA,CANA,QAAA,CAKA,UAAA,CAHA,aAAA,CAEA,UAAA,CAHA,MAAA,CAFA,iBAAA,CAaA,2CAAA,CALA,2DACE,CAGF,kDAAA,CARA,+B3C6tIR,CACF,C2C/sII,8CACE,Y3CitIN,C2C7sII,iCASE,+BAAA,CACA,6BAAA,CAJA,uCAAA,CAEA,cAAA,CAPA,aAAA,CAGA,
gBAAA,CACA,eAAA,CAFA,8BAAA,CAWA,+BAAA,CAHA,2CACE,CALF,kBAAA,CALA,U3CytIN,C2C1sIM,aAII,6CACE,O3CysIV,C2C1sIQ,8CACE,O3C4sIV,C2C7sIQ,8CACE,O3C+sIV,C2ChtIQ,8CACE,O3CktIV,C2CntIQ,8CACE,O3CqtIV,C2CttIQ,8CACE,O3CwtIV,C2CztIQ,8CACE,O3C2tIV,C2C5tIQ,8CACE,O3C8tIV,C2C/tIQ,8CACE,O3CiuIV,C2CluIQ,+CACE,Q3CouIV,C2CruIQ,+CACE,Q3CuuIV,C2CxuIQ,+CACE,Q3C0uIV,C2C3uIQ,+CACE,Q3C6uIV,C2C9uIQ,+CACE,Q3CgvIV,C2CjvIQ,+CACE,Q3CmvIV,C2CpvIQ,+CACE,Q3CsvIV,C2CvvIQ,+CACE,Q3CyvIV,C2C1vIQ,+CACE,Q3C4vIV,C2C7vIQ,+CACE,Q3C+vIV,C2ChwIQ,+CACE,Q3CkwIV,CACF,C2C7vIM,uCACE,gC3C+vIR,C2C3vIM,oDACE,a3C6vIR,C2CxvII,yCACE,S3C0vIN,C2CtvIM,2CACE,aAAA,CACA,8B3CwvIR,C2ClvIE,4BACE,U3CovIJ,C2CjvII,aAJF,4BAKI,gB3CovIJ,CACF,C2ChvIE,0BACE,Y3CkvIJ,C2C/uII,aAJF,0BAKI,a3CkvIJ,C2C9uIM,sCACE,O3CgvIR,C2CjvIM,uCACE,O3CmvIR,C2CpvIM,uCACE,O3CsvIR,C2CvvIM,uCACE,O3CyvIR,C2C1vIM,uCACE,O3C4vIR,C2C7vIM,uCACE,O3C+vIR,C2ChwIM,uCACE,O3CkwIR,C2CnwIM,uCACE,O3CqwIR,C2CtwIM,uCACE,O3CwwIR,C2CzwIM,wCACE,Q3C2wIR,C2C5wIM,wCACE,Q3C8wIR,C2C/wIM,wCACE,Q3CixIR,C2ClxIM,wCACE,Q3CoxIR,C2CrxIM,wCACE,Q3CuxIR,C2CxxIM,wCACE,Q3C0xIR,C2C3xIM,wCACE,Q3C6xIR,C2C9xIM,wCACE,Q3CgyIR,C2CjyIM,wCACE,Q3CmyIR,C2CpyIM,wCACE,Q3CsyIR,C2CvyIM,wCACE,Q3CyyIR,CACF,C2CnyII,+FAEE,Q3CqyIN,C2ClyIM,yGACE,wBAAA,CACA,yB3CqyIR,C2C5xIM,2DAEE,wBAAA,CACA,yBAAA,CAFA,Q3CgyIR,C2CzxIM,iEACE,Q3C2xIR,C2CxxIQ,qLAGE,wBAAA,CACA,yBAAA,CAFA,Q3C4xIV,C2CtxIQ,6FACE,wBAAA,CACA,yB3CwxIV,C2CnxIM,yDACE,kB3CqxIR,C2ChxII,sCACE,Q3CkxIN,C2C7wIE,2BAEE,iBAAA,CAOA,kBAAA,CAHA,uCAAA,CAEA,cAAA,CAPA,aAAA,CAGA,YAAA,CACA,gBAAA,CAEA,mBAAA,CAGA,gCAAA,CAPA,W3CsxIJ,C2C5wII,iCAEE,uDAAA,CADA,+B3C+wIN,C2C1wII,iCAKE,6BAAA,CADA,UAAA,CAHA,aAAA,CAEA,WAAA,CAMA,8CAAA,CAAA,sCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CANA,+CACE,CALF,U3CoxIN,C2CrwIE,4BAOE,yEACE,CANF,YAAA,CAGA,aAAA,CAFA,qBAAA,CAGA,mBAAA,CALA,iBAAA,CAYA,wBAAA,CATA,Y3C2wIJ,C2C/vII,sCACE,wB3CiwIN,C2C7vII,oCACE,S3C+vIN,C2C3vII,kCAGE,wEACE,CAFF,mBAAA,CADA,O3C+vIN,C2CrvIM,uDACE,8CAAA,CAAA,sC3CuvIR,CK93II,0CsCqJF,wDAEE,kB3C+uIF,C2CjvIA,wDAEE,mB3C+
uIF,C2CjvIA,8CAGE,eAAA,CAFA,eAAA,CAGA,iC3C6uIF,C2CzuIE,8DACE,mB3C4uIJ,C2C7uIE,8DACE,kB3C4uIJ,C2C7uIE,oDAEE,U3C2uIJ,C2CvuIE,8EAEE,kB3C0uIJ,C2C5uIE,8EAEE,mB3C0uIJ,C2C5uIE,8EAGE,kB3CyuIJ,C2C5uIE,8EAGE,mB3CyuIJ,C2C5uIE,oEACE,U3C2uIJ,C2CruIE,8EAEE,mB3CwuIJ,C2C1uIE,8EAEE,kB3CwuIJ,C2C1uIE,8EAGE,mB3CuuIJ,C2C1uIE,8EAGE,kB3CuuIJ,C2C1uIE,oEACE,U3CyuIJ,CACF,C2C3tIE,cAHF,olDAII,gC3C8tIF,C2C3tIE,g8GACE,uC3C6tIJ,CACF,C2CxtIA,4sDACE,+B3C2tIF,C2CvtIA,wmDACE,a3C0tIF,C4C9lJA,MACE,8WAAA,CACA,uX5CimJF,C4CxlJE,4BAEE,oBAAA,CADA,iB5C4lJJ,C4CvlJI,sDAGE,S5CylJN,C4C5lJI,sDAGE,U5CylJN,C4C5lJI,4CACE,iBAAA,CACA,S5C0lJN,C4CplJE,+CAEE,SAAA,CADA,U5CulJJ,C4CllJE,kDAOE,W5CwlJJ,C4C/lJE,kDAOE,Y5CwlJJ,C4C/lJE,wCAME,qDAAA,CADA,UAAA,CADA,aAAA,CAIA,0CAAA,CAAA,kCAAA,CACA,4BAAA,CAAA,oBAAA,CACA,6BAAA,CAAA,qBAAA,CACA,yBAAA,CAAA,iBAAA,CAVA,iBAAA,CACA,SAAA,CACA,Y5C4lJJ,C4ChlJE,gEACE,wB1B2Wa,C0B1Wb,mDAAA,CAAA,2C5CklJJ,C6CloJA,QACE,8DAAA,CAGA,+CAAA,CACA,iEAAA,CACA,oDAAA,CACA,sDAAA,CACA,mDAAA,CAGA,qEAAA,CACA,qEAAA,CACA,wEAAA,CACA,0EAAA,CACA,wEAAA,CACA,yEAAA,CACA,kEAAA,CACA,+DAAA,CACA,oEAAA,CACA,oEAAA,CACA,mEAAA,CACA,gEAAA,CACA,uEAAA,CACA,mEAAA,CACA,qEAAA,CACA,oEAAA,CACA,gEAAA,CACA,wEAAA,CACA,qEAAA,CACA,+D7CioJF,C6C3nJA,SAEE,kBAAA,CADA,Y7C+nJF,C8CjqJE,kBAUE,cAAA,CATA,YAAA,CACA,kEACE,CAQF,Y9C6pJJ,C8CzpJI,sDACE,gB9C2pJN,C8CrpJI,oFAKE,wDAAA,CACA,mBAAA,CAJA,aAAA,CAEA,QAAA,CADA,aAAA,CAIA,sC9CupJN,C8ClpJM,iOACE,kBAAA,CACA,8B9CqpJR,C8CjpJM,6FACE,iBAAA,CAAA,c9CopJR,C8ChpJM,2HACE,Y9CmpJR,C8C/oJM,wHACE,e9CkpJR,C8CnoJI,yMAGE,eAAA,CAAA,Y9C2oJN,C8C7nJI,ybAOE,W9CmoJN,C8C/nJI,8BACE,eAAA,CAAA,Y9CioJN,CK7jJI,mC0ChKA,8BACE,U/CquJJ,C+CtuJE,8BACE,W/CquJJ,C+CtuJE,8BAGE,kB/CmuJJ,C+CtuJE,8BAGE,iB/CmuJJ,C+CtuJE,oBAKE,mBAAA,CADA,YAAA,CAFA,a/CouJJ,C+C9tJI,kCACE,W/CiuJN,C+CluJI,kCACE,U/CiuJN,C+CluJI,kCAEE,iBAAA,CAAA,c/CguJN,C+CluJI,kCAEE,aAAA,CAAA,kB/CguJN,CACF","file":"main.css"}
\ No newline at end of file
diff --git a/assets/stylesheets/palette.06af60db.min.css b/assets/stylesheets/palette.06af60db.min.css
new file mode 100644
index 0000000..a640d38
--- /dev/null
+++ b/assets/stylesheets/palette.06af60db.min.css
@@ -0,0 +1 @@
+@media screen{[data-md-color-scheme=slate]{--md-default-fg-color:hsla(var(--md-hue),15%,90%,0.82);--md-default-fg-color--light:hsla(var(--md-hue),15%,90%,0.56);--md-default-fg-color--lighter:hsla(var(--md-hue),15%,90%,0.32);--md-default-fg-color--lightest:hsla(var(--md-hue),15%,90%,0.12);--md-default-bg-color:hsla(var(--md-hue),15%,14%,1);--md-default-bg-color--light:hsla(var(--md-hue),15%,14%,0.54);--md-default-bg-color--lighter:hsla(var(--md-hue),15%,14%,0.26);--md-default-bg-color--lightest:hsla(var(--md-hue),15%,14%,0.07);--md-code-fg-color:hsla(var(--md-hue),18%,86%,0.82);--md-code-bg-color:hsla(var(--md-hue),15%,18%,1);--md-code-hl-color:#2977ff;--md-code-hl-color--light:#2977ff1a;--md-code-hl-number-color:#e6695b;--md-code-hl-special-color:#f06090;--md-code-hl-function-color:#c973d9;--md-code-hl-constant-color:#9383e2;--md-code-hl-keyword-color:#6791e0;--md-code-hl-string-color:#2fb170;--md-code-hl-name-color:var(--md-code-fg-color);--md-code-hl-operator-color:var(--md-default-fg-color--light);--md-code-hl-punctuation-color:var(--md-default-fg-color--light);--md-code-hl-comment-color:var(--md-default-fg-color--light);--md-code-hl-generic-color:var(--md-default-fg-color--light);--md-code-hl-variable-color:var(--md-default-fg-color--light);--md-typeset-color:var(--md-default-fg-color);--md-typeset-a-color:var(--md-primary-fg-color);--md-typeset-kbd-color:hsla(var(--md-hue),15%,90%,0.12);--md-typeset-kbd-accent-color:hsla(var(--md-hue),15%,90%,0.2);--md-typeset-kbd-border-color:hsla(var(--md-hue),15%,14%,1);--md-typeset-mark-color:#4287ff4d;--md-typeset-table-color:hsla(var(--md-hue),15%,95%,0.12);--md-typeset-table-color--light:hsla(var(--md-hue),15%,95%,0.035);--md-admonition-fg-color:var(--md-default-fg-color);--md-admonition-bg-color:var(--md-default-bg-color);--md-footer-bg-color:hsla(var(--md-hue),15%,10%,0.87);--md-footer-bg-color--dark:hsla(var(--md-hue),15%,8%,1);--md-shadow-z1:0 0.2rem 0.5rem #0000000d,0 0 0.05rem #0000001a;--md-shadow-z2:0 0.2rem 
0.5rem #00000040,0 0 0.05rem #00000040;--md-shadow-z3:0 0.2rem 0.5rem #0006,0 0 0.05rem #00000059;color-scheme:dark}[data-md-color-scheme=slate] img[src$="#gh-light-mode-only"],[data-md-color-scheme=slate] img[src$="#only-light"]{display:none}[data-md-color-scheme=slate][data-md-color-primary=pink]{--md-typeset-a-color:#ed5487}[data-md-color-scheme=slate][data-md-color-primary=purple]{--md-typeset-a-color:#c46fd3}[data-md-color-scheme=slate][data-md-color-primary=deep-purple]{--md-typeset-a-color:#a47bea}[data-md-color-scheme=slate][data-md-color-primary=indigo]{--md-typeset-a-color:#5488e8}[data-md-color-scheme=slate][data-md-color-primary=teal]{--md-typeset-a-color:#00ccb8}[data-md-color-scheme=slate][data-md-color-primary=green]{--md-typeset-a-color:#71c174}[data-md-color-scheme=slate][data-md-color-primary=deep-orange]{--md-typeset-a-color:#ff764d}[data-md-color-scheme=slate][data-md-color-primary=brown]{--md-typeset-a-color:#c1775c}[data-md-color-scheme=slate][data-md-color-primary=black],[data-md-color-scheme=slate][data-md-color-primary=blue-grey],[data-md-color-scheme=slate][data-md-color-primary=grey],[data-md-color-scheme=slate][data-md-color-primary=white]{--md-typeset-a-color:#5e8bde}[data-md-color-switching] *,[data-md-color-switching] :after,[data-md-color-switching] 
:before{transition-duration:0ms!important}}[data-md-color-accent=red]{--md-accent-fg-color:#ff1947;--md-accent-fg-color--transparent:#ff19471a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=pink]{--md-accent-fg-color:#f50056;--md-accent-fg-color--transparent:#f500561a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=purple]{--md-accent-fg-color:#df41fb;--md-accent-fg-color--transparent:#df41fb1a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=deep-purple]{--md-accent-fg-color:#7c4dff;--md-accent-fg-color--transparent:#7c4dff1a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=indigo]{--md-accent-fg-color:#526cfe;--md-accent-fg-color--transparent:#526cfe1a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=blue]{--md-accent-fg-color:#4287ff;--md-accent-fg-color--transparent:#4287ff1a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=light-blue]{--md-accent-fg-color:#0091eb;--md-accent-fg-color--transparent:#0091eb1a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=cyan]{--md-accent-fg-color:#00bad6;--md-accent-fg-color--transparent:#00bad61a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=teal]{--md-accent-fg-color:#00bda4;--md-accent-fg-color--transparent:#00bda41a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=green]{--md-accent-fg-color:#00c753;--md-accent-fg-color--transparent:#00c7531a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=light-green]{--md-accent-fg-color:#63de17;--md-accent-fg-color--transparent:#63de171a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-accent=lime]{--md-accent-fg-color:#b0eb00;--md-accent-fg-color--transparent:#b0eb001a;--md-accent-bg-co
lor:#000000de;--md-accent-bg-color--light:#0000008a}[data-md-color-accent=yellow]{--md-accent-fg-color:#ffd500;--md-accent-fg-color--transparent:#ffd5001a;--md-accent-bg-color:#000000de;--md-accent-bg-color--light:#0000008a}[data-md-color-accent=amber]{--md-accent-fg-color:#fa0;--md-accent-fg-color--transparent:#ffaa001a;--md-accent-bg-color:#000000de;--md-accent-bg-color--light:#0000008a}[data-md-color-accent=orange]{--md-accent-fg-color:#ff9100;--md-accent-fg-color--transparent:#ff91001a;--md-accent-bg-color:#000000de;--md-accent-bg-color--light:#0000008a}[data-md-color-accent=deep-orange]{--md-accent-fg-color:#ff6e42;--md-accent-fg-color--transparent:#ff6e421a;--md-accent-bg-color:#fff;--md-accent-bg-color--light:#ffffffb3}[data-md-color-primary=red]{--md-primary-fg-color:#ef5552;--md-primary-fg-color--light:#e57171;--md-primary-fg-color--dark:#e53734;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=pink]{--md-primary-fg-color:#e92063;--md-primary-fg-color--light:#ec417a;--md-primary-fg-color--dark:#c3185d;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=purple]{--md-primary-fg-color:#ab47bd;--md-primary-fg-color--light:#bb69c9;--md-primary-fg-color--dark:#8c24a8;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=deep-purple]{--md-primary-fg-color:#7e56c2;--md-primary-fg-color--light:#9574cd;--md-primary-fg-color--dark:#673ab6;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=indigo]{--md-primary-fg-color:#4051b5;--md-primary-fg-color--light:#5d6cc0;--md-primary-fg-color--dark:#303fa1;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=blue]{--md-primary-fg-color:#2094f3;--md-primary-fg-color--light:#42a5f5;--md-primary-fg-color--dark:#1975d2;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=light-blue]{--md-primary-fg-color:#02a6f2;-
-md-primary-fg-color--light:#28b5f6;--md-primary-fg-color--dark:#0287cf;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=cyan]{--md-primary-fg-color:#00bdd6;--md-primary-fg-color--light:#25c5da;--md-primary-fg-color--dark:#0097a8;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=teal]{--md-primary-fg-color:#009485;--md-primary-fg-color--light:#26a699;--md-primary-fg-color--dark:#007a6c;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=green]{--md-primary-fg-color:#4cae4f;--md-primary-fg-color--light:#68bb6c;--md-primary-fg-color--dark:#398e3d;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=light-green]{--md-primary-fg-color:#8bc34b;--md-primary-fg-color--light:#9ccc66;--md-primary-fg-color--dark:#689f38;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=lime]{--md-primary-fg-color:#cbdc38;--md-primary-fg-color--light:#d3e156;--md-primary-fg-color--dark:#b0b52c;--md-primary-bg-color:#000000de;--md-primary-bg-color--light:#0000008a}[data-md-color-primary=yellow]{--md-primary-fg-color:#ffec3d;--md-primary-fg-color--light:#ffee57;--md-primary-fg-color--dark:#fbc02d;--md-primary-bg-color:#000000de;--md-primary-bg-color--light:#0000008a}[data-md-color-primary=amber]{--md-primary-fg-color:#ffc105;--md-primary-fg-color--light:#ffc929;--md-primary-fg-color--dark:#ffa200;--md-primary-bg-color:#000000de;--md-primary-bg-color--light:#0000008a}[data-md-color-primary=orange]{--md-primary-fg-color:#ffa724;--md-primary-fg-color--light:#ffa724;--md-primary-fg-color--dark:#fa8900;--md-primary-bg-color:#000000de;--md-primary-bg-color--light:#0000008a}[data-md-color-primary=deep-orange]{--md-primary-fg-color:#ff6e42;--md-primary-fg-color--light:#ff8a66;--md-primary-fg-color--dark:#f4511f;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=brown]{--md-prima
ry-fg-color:#795649;--md-primary-fg-color--light:#8d6e62;--md-primary-fg-color--dark:#5d4037;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3}[data-md-color-primary=grey]{--md-primary-fg-color:#757575;--md-primary-fg-color--light:#9e9e9e;--md-primary-fg-color--dark:#616161;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3;--md-typeset-a-color:#4051b5}[data-md-color-primary=blue-grey]{--md-primary-fg-color:#546d78;--md-primary-fg-color--light:#607c8a;--md-primary-fg-color--dark:#455a63;--md-primary-bg-color:#fff;--md-primary-bg-color--light:#ffffffb3;--md-typeset-a-color:#4051b5}[data-md-color-primary=light-green]:not([data-md-color-scheme=slate]){--md-typeset-a-color:#72ad2e}[data-md-color-primary=lime]:not([data-md-color-scheme=slate]){--md-typeset-a-color:#8b990a}[data-md-color-primary=yellow]:not([data-md-color-scheme=slate]){--md-typeset-a-color:#b8a500}[data-md-color-primary=amber]:not([data-md-color-scheme=slate]){--md-typeset-a-color:#d19d00}[data-md-color-primary=orange]:not([data-md-color-scheme=slate]){--md-typeset-a-color:#e68a00}[data-md-color-primary=white]{--md-primary-fg-color:hsla(var(--md-hue),0%,100%,1);--md-primary-fg-color--light:hsla(var(--md-hue),0%,100%,0.7);--md-primary-fg-color--dark:hsla(var(--md-hue),0%,0%,0.07);--md-primary-bg-color:hsla(var(--md-hue),0%,0%,0.87);--md-primary-bg-color--light:hsla(var(--md-hue),0%,0%,0.54);--md-typeset-a-color:#4051b5}[data-md-color-primary=white] .md-button{color:var(--md-typeset-a-color)}[data-md-color-primary=white] .md-button--primary{background-color:var(--md-typeset-a-color);border-color:var(--md-typeset-a-color);color:hsla(var(--md-hue),0%,100%,1)}@media screen and (min-width:60em){[data-md-color-primary=white] .md-search__form{background-color:hsla(var(--md-hue),0%,0%,.07)}[data-md-color-primary=white] .md-search__form:hover{background-color:hsla(var(--md-hue),0%,0%,.32)}[data-md-color-primary=white] 
.md-search__input+.md-search__icon{color:hsla(var(--md-hue),0%,0%,.87)}}@media screen and (min-width:76.25em){[data-md-color-primary=white] .md-tabs{border-bottom:.05rem solid #00000012}}[data-md-color-primary=black]{--md-primary-fg-color:hsla(var(--md-hue),15%,9%,1);--md-primary-fg-color--light:hsla(var(--md-hue),15%,9%,0.54);--md-primary-fg-color--dark:hsla(var(--md-hue),15%,9%,1);--md-primary-bg-color:hsla(var(--md-hue),15%,100%,1);--md-primary-bg-color--light:hsla(var(--md-hue),15%,100%,0.7);--md-typeset-a-color:#4051b5}[data-md-color-primary=black] .md-button{color:var(--md-typeset-a-color)}[data-md-color-primary=black] .md-button--primary{background-color:var(--md-typeset-a-color);border-color:var(--md-typeset-a-color);color:hsla(var(--md-hue),0%,100%,1)}[data-md-color-primary=black] .md-header{background-color:hsla(var(--md-hue),15%,9%,1)}@media screen and (max-width:59.984375em){[data-md-color-primary=black] .md-nav__source{background-color:hsla(var(--md-hue),15%,11%,.87)}}@media screen and (max-width:76.234375em){html [data-md-color-primary=black] .md-nav--primary .md-nav__title[for=__drawer]{background-color:hsla(var(--md-hue),15%,9%,1)}}@media screen and (min-width:76.25em){[data-md-color-primary=black] .md-tabs{background-color:hsla(var(--md-hue),15%,9%,1)}}
\ No newline at end of file
diff --git a/assets/stylesheets/palette.06af60db.min.css.map b/assets/stylesheets/palette.06af60db.min.css.map
new file mode 100644
index 0000000..efb568c
--- /dev/null
+++ b/assets/stylesheets/palette.06af60db.min.css.map
@@ -0,0 +1 @@
+{"version":3,"sources":["src/templates/assets/stylesheets/palette/_scheme.scss","../../../../src/templates/assets/stylesheets/palette.scss","src/templates/assets/stylesheets/palette/_accent.scss","src/templates/assets/stylesheets/palette/_primary.scss","src/templates/assets/stylesheets/utilities/_break.scss"],"names":[],"mappings":"AA2BA,cAGE,6BAME,sDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CACA,mDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CAGA,mDAAA,CACA,gDAAA,CAGA,0BAAA,CACA,mCAAA,CAGA,iCAAA,CACA,kCAAA,CACA,mCAAA,CACA,mCAAA,CACA,kCAAA,CACA,iCAAA,CACA,+CAAA,CACA,6DAAA,CACA,gEAAA,CACA,4DAAA,CACA,4DAAA,CACA,6DAAA,CAGA,6CAAA,CAGA,+CAAA,CAGA,uDAAA,CACA,6DAAA,CACA,2DAAA,CAGA,iCAAA,CAGA,yDAAA,CACA,iEAAA,CAGA,mDAAA,CACA,mDAAA,CAGA,qDAAA,CACA,uDAAA,CAGA,8DAAA,CAKA,8DAAA,CAKA,0DAAA,CAvEA,iBCeF,CD6DE,kHAEE,YC3DJ,CDkFE,yDACE,4BChFJ,CD+EE,2DACE,4BC7EJ,CD4EE,gEACE,4BC1EJ,CDyEE,2DACE,4BCvEJ,CDsEE,yDACE,4BCpEJ,CDmEE,0DACE,4BCjEJ,CDgEE,gEACE,4BC9DJ,CD6DE,0DACE,4BC3DJ,CD0DE,2OACE,4BC/CJ,CDsDA,+FAGE,iCCpDF,CACF,CC/CE,2BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD2CN,CCrDE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDkDN,CC5DE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDyDN,CCnEE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDgEN,CC1EE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDuEN,CCjFE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD8EN,CCxFE,kCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDqFN,CC/FE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD4FN,CCtGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDmGN,CC7GE,6BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD0GN,CCpHE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDiHN,CC3HE,4BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD2HN,CClIE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDkIN,CCzIE,6BACE,yBAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDyIN,CChJE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDgJN,CCvJE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDoJN,CEzJE,4BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFsJN,CEjKE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCF8JN,CEzKE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,C
AOE,0BAAA,CACA,sCFsKN,CEjLE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCF8KN,CEzLE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFsLN,CEjME,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCF8LN,CEzME,mCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFsMN,CEjNE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCF8MN,CEzNE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFsNN,CEjOE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCF8NN,CEzOE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFsON,CEjPE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFiPN,CEzPE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFyPN,CEjQE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFiQN,CEzQE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFyQN,CEjRE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCF8QN,CEzRE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFsRN,CEjSE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BF0RN,CE1SE,kCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BFmSN,CEpRE,sEACE,4BFuRJ,CExRE,+DACE,4BF2RJ,CE5RE,iEACE,4BF+RJ,CEhSE,gEACE,4BFmSJ,CEpSE,iEACE,4BFuSJ,CE9RA,8BACE,mDAAA,CACA,4DAAA,CACA,0DAAA,CACA,oDAAA,CACA,2DAAA,CAGA,4BF+RF,CE5RE,yCACE,+BF8RJ,CE3RI,kDAEE,0CAAA,CACA,sCAAA,CAFA,mCF+RN,CG3MI,mCD1EA,+CACE,8CFwRJ,CErRI,qDACE,8CFuRN,CElRE,iEACE,mCFoRJ,CACF,CGtNI,sCDvDA,uCACE,oCFgRJ,CACF,CEvQA,8BACE,kDAAA,CACA,4DAAA,CACA,wDAAA,CACA,oDAAA,CACA,6DAAA,CAGA,4BFwQF,CErQE,yCACE,+BFuQJ,CEpQI,kDAEE,0CAAA,CACA,sCAAA,CAFA,mCFwQN,CEjQE,yCACE,6CFmQJ,CG5NI,0CDhCA,8CACE,gDF+PJ,CACF,CGjOI,0CDvBA,iFACE,6CF2PJ,CACF,CGzPI,sCDKA,uCACE,6CFuPJ,CACF","file":"palette.css"}
\ No newline at end of file
diff --git a/compose-train-validation-data/index.html b/compose-train-validation-data/index.html
new file mode 100644
index 0000000..18d9eaa
--- /dev/null
+++ b/compose-train-validation-data/index.html
@@ -0,0 +1,632 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/compose-train-validation-data/">
+      
+      
+        <link rel="prev" href="../add-your-own-data/">
+      
+      
+        <link rel="next" href="../related-work/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Compose training and validation dataset - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#compose-dataset" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Compose training and validation dataset
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="compose-dataset">Compose dataset</h1>
+<p>The pipeline step that produces the final training or validation set is the <code>compose</code> step.
+Before you run this command, you should specify in the <a href="../config-files/">config</a> files which datasets should be selected and how they should be sampled.</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>llm-datasets<span class="w"> </span>compose<span class="w"> </span>--split<span class="o">=</span>train<span class="w"> </span>--configs<span class="o">=</span>my_dataset.yaml<span class="w"> </span><span class="se">\</span>
+</span><span id="__span-0-2"><a id="__codelineno-0-2" name="__codelineno-0-2" href="#__codelineno-0-2"></a><span class="w">    </span>--text_data_dir<span class="o">=</span>/data/my_text_data<span class="w"> </span><span class="se">\</span>
+</span><span id="__span-0-3"><a id="__codelineno-0-3" name="__codelineno-0-3" href="#__codelineno-0-3"></a><span class="w">    </span>--composed_data_dir<span class="o">=</span>/data/my_composed_data/train/
+</span></code></pre></div>
+<p>Depending on your system (especially I/O speed) and the dataset size, this step can take a substantial amount of time (&gt; 24 hours for a 1T token dataset).</p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/config-files/index.html b/config-files/index.html
new file mode 100644
index 0000000..3ad6b2f
--- /dev/null
+++ b/config-files/index.html
@@ -0,0 +1,744 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/config-files/">
+      
+      
+        <link rel="prev" href="../datasets/">
+      
+      
+        <link rel="next" href="../extract-text-data/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Config files - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#config-files" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Config files
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#specifing-local-paths" class="md-nav__link">
+    <span class="md-ellipsis">
+      Specifying local paths
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#dataset-selection-and-sampling" class="md-nav__link">
+    <span class="md-ellipsis">
+      Dataset selection and sampling
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#specifing-local-paths" class="md-nav__link">
+    <span class="md-ellipsis">
+      Specifying local paths
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#dataset-selection-and-sampling" class="md-nav__link">
+    <span class="md-ellipsis">
+      Dataset selection and sampling
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="config-files">Config Files</h1>
+<p><code>llm-datasets</code> allows you to specify general settings through config files so you do not need to always specify the same command-line arguments.
+Several commands support passing the <code>--configs</code> argument which should point to one or more YAML-files on your file system. For example, the text extraction command:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>llm-datasets<span class="w"> </span>extract_text<span class="w"> </span>...<span class="w"> </span>--configs<span class="w"> </span><span class="nv">$PATH_TO_YAML_CONFIG_FILE</span>
+</span></code></pre></div>
+<h2 id="specifing-local-paths">Specifying local paths</h2>
+<p>In the config files, you can store, for example, system-specific settings such as the local paths where the raw dataset files are located:</p>
+<div class="language-yaml highlight"><pre><span></span><code><span id="__span-1-1"><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a><span class="c1"># ./examples/llm_datasets_configs/my_system.yaml</span>
+</span><span id="__span-1-2"><a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a><span class="nt">local_dirs_by_source_id</span><span class="p">:</span>
+</span><span id="__span-1-3"><a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a><span class="w">  </span><span class="nt">redpajama</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">/my_system_specific_data_directory/redpajama</span>
+</span></code></pre></div>
+<p>The <a href="https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T">RedPajama dataset</a> requires a manual download prior to text extraction.
+With the above config, we tell the extraction command the path where we downloaded the RedPajama data by providing the config file:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-2-1"><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a>llm-datasets<span class="w"> </span>extract_text<span class="w"> </span>redpajama_book<span class="w"> </span>--configs<span class="w"> </span>./examples/llm_datasets_configs/my_system.yaml
+</span></code></pre></div>
+<h2 id="dataset-selection-and-sampling">Dataset selection and sampling</h2>
+<p>The configuration files are also needed for specifying the final dataset composition, including the selection of the datasets and their sampling.
+The following example shows a config for an Italian dataset:</p>
+<div class="language-yaml highlight"><pre><span></span><code><span id="__span-3-1"><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="c1"># ./examples/llm_datasets_configs/italian_data.yaml</span>
+</span><span id="__span-3-2"><a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a>
+</span><span id="__span-3-3"><a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a><span class="c1"># a fixed random seed for shuffling etc.</span>
+</span><span id="__span-3-4"><a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a><span class="nt">seed</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0</span>
+</span><span id="__span-3-5"><a id="__codelineno-3-5" name="__codelineno-3-5" href="#__codelineno-3-5"></a>
+</span><span id="__span-3-6"><a id="__codelineno-3-6" name="__codelineno-3-6" href="#__codelineno-3-6"></a><span class="nt">selected_dataset_ids</span><span class="p">:</span>
+</span><span id="__span-3-7"><a id="__codelineno-3-7" name="__codelineno-3-7" href="#__codelineno-3-7"></a><span class="w">  </span><span class="c1"># italian subsets</span>
+</span><span id="__span-3-8"><a id="__codelineno-3-8" name="__codelineno-3-8" href="#__codelineno-3-8"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">itwac</span>
+</span><span id="__span-3-9"><a id="__codelineno-3-9" name="__codelineno-3-9" href="#__codelineno-3-9"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">eurlex_it</span>
+</span><span id="__span-3-10"><a id="__codelineno-3-10" name="__codelineno-3-10" href="#__codelineno-3-10"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">wikipedia_20231101_it</span>
+</span><span id="__span-3-11"><a id="__codelineno-3-11" name="__codelineno-3-11" href="#__codelineno-3-11"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">wikibooks_it</span>
+</span><span id="__span-3-12"><a id="__codelineno-3-12" name="__codelineno-3-12" href="#__codelineno-3-12"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">wikinews_it</span>
+</span><span id="__span-3-13"><a id="__codelineno-3-13" name="__codelineno-3-13" href="#__codelineno-3-13"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">colossal_oscar_2023-23_it</span>
+</span><span id="__span-3-14"><a id="__codelineno-3-14" name="__codelineno-3-14" href="#__codelineno-3-14"></a><span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">parlamint_it</span>
+</span><span id="__span-3-15"><a id="__codelineno-3-15" name="__codelineno-3-15" href="#__codelineno-3-15"></a>
+</span><span id="__span-3-16"><a id="__codelineno-3-16" name="__codelineno-3-16" href="#__codelineno-3-16"></a><span class="c1"># down-sample webcrawled + up-sampled high quality</span>
+</span><span id="__span-3-17"><a id="__codelineno-3-17" name="__codelineno-3-17" href="#__codelineno-3-17"></a><span class="nt">sampling_factor_by_source_id</span><span class="p">:</span>
+</span><span id="__span-3-18"><a id="__codelineno-3-18" name="__codelineno-3-18" href="#__codelineno-3-18"></a><span class="w">  </span><span class="nt">colossal_oscar</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.1</span>
+</span><span id="__span-3-19"><a id="__codelineno-3-19" name="__codelineno-3-19" href="#__codelineno-3-19"></a>
+</span><span id="__span-3-20"><a id="__codelineno-3-20" name="__codelineno-3-20" href="#__codelineno-3-20"></a><span class="nt">sampling_factor_by_dataset_id</span><span class="p">:</span>
+</span><span id="__span-3-21"><a id="__codelineno-3-21" name="__codelineno-3-21" href="#__codelineno-3-21"></a><span class="w">  </span><span class="nt">itwac</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">0.5</span>
+</span><span id="__span-3-22"><a id="__codelineno-3-22" name="__codelineno-3-22" href="#__codelineno-3-22"></a><span class="w">  </span><span class="nt">eurlex_it</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">2</span>
+</span><span id="__span-3-23"><a id="__codelineno-3-23" name="__codelineno-3-23" href="#__codelineno-3-23"></a><span class="w">  </span><span class="nt">wikipedia_20231101_it</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">3</span>
+</span></code></pre></div>
+<p>To use this config, provide the path in the <code>--configs</code> argument:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-4-1"><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a><span class="c1"># compose final dataset</span>
+</span><span id="__span-4-2"><a id="__codelineno-4-2" name="__codelineno-4-2" href="#__codelineno-4-2"></a>llm-datasets<span class="w"> </span>compose<span class="w"> </span>...<span class="w"> </span>--configs<span class="w"> </span>./examples/llm_datasets_configs/italian_data.yaml
+</span></code></pre></div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/index.html b/datasets/index.html
new file mode 100644
index 0000000..6a528c5
--- /dev/null
+++ b/datasets/index.html
@@ -0,0 +1,1625 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/">
+      
+      
+        <link rel="prev" href="../overview/">
+      
+      
+        <link rel="next" href="../config-files/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Available datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#languages" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Available datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#languages" class="md-nav__link">
+    <span class="md-ellipsis">
+      Languages
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#data-sources" class="md-nav__link">
+    <span class="md-ellipsis">
+      Data sources
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#languages" class="md-nav__link">
+    <span class="md-ellipsis">
+      Languages
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#data-sources" class="md-nav__link">
+    <span class="md-ellipsis">
+      Data sources
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+  <h1>Available datasets</h1>
+
+<p># Datasets</p>
+<p>The framework provides 2241 datasets from 62 sources in 164 languages. The languages are as follows: Afrikaans, Amharic, Aragonese, Arabic, Arz, Assamese, Ast, Avaric, Azerbaijani, Azb, Bashkir, Belarusian, Bulgarian, Bihari, Bengali, Tibetan, Bpy, Breton, Bosnian, Bxr, Catalan, Chechen, Ceb, Ckb, Code, Czech, Chuvash, Welsh, Danish, German, Dsb, Dhivehi, Greek, English, Esperanto, Spanish, Estonian, Basque, Persian, Finnish, French, Western Frisian, Irish, Gaelic, Galician, Guaraní, Gom, Gsw, Gujarati, Hausa, Hebrew, Hindi, Croatian, Hsb, Haitian, Hungarian, Armenian, Interlingua, Indonesian, Interlingue, Igbo, Ilo, Ido, Icelandic, Italian, Japanese, Jbo, Javanese, Georgian, Kazakh, Khmer, Kannada, Korean, Krc, Kurdish, Komi, Cornish, Kirghiz, Latin, Luxembourgish, Lez, Limburgish, Lmo, Lao, Lithuanian, Latvian, Mai, Malagasy, Mhr, Min, Macedonian, Malayalam, Mongolian, Marathi, Mrj, Malay, Maltese, Multi, Mwl, Burmese, Mzn, Nah, Nds, Nepali, New, Dutch, Norwegian Nynorsk, Norwegian, Chichewa, Occitan, Oromo, Oriya, Ossetian, Panjabi, Polish, Pms, Pnb, Pashto, Portuguese, Quechua, Romanian, Russian, Kinyarwanda, Sanskrit, Sah, Sindhi, Serbo-Croatian, Sinhalese, Slovak, Slovene, Shona, Somali, Albanian, Serbian, Southern Sotho, Sundanese, Swedish, Swahili, Tamil, Telugu, Tajik, Thai, Tigrinya, Turkmen, Tagalog, Turkish, Tatar, Uighur, Ukrainian, Urdu, Uzbek, Vietnamese, Volapük, Walloon, War, Wuu, X-Eml, Xal, Xhosa, Xmf, Yiddish, Yoruba, Chinese, Zu</p>
+<h2 id="languages">Languages</h2>
+<p><img alt="Tokens by language" src="tokens_by_language.png" /></p>
+<table>
+<thead>
+<tr>
+<th style="text-align: left;">language</th>
+<th style="text-align: left;">reported_tokens</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align: left;">af</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">am</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">an</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ar</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">arz</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">as</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ast</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">av</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">az</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">azb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ba</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">be</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">bg</td>
+<td style="text-align: left;">13 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">bh</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">bn</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">bo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">bpy</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">br</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">bs</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">bxr</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ca</td>
+<td style="text-align: left;">4 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ce</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ceb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ckb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">code</td>
+<td style="text-align: left;">250 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">cs</td>
+<td style="text-align: left;">21 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">cv</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">cy</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">da</td>
+<td style="text-align: left;">11 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">de</td>
+<td style="text-align: left;">26 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">dsb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">dv</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">el</td>
+<td style="text-align: left;">24 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">en</td>
+<td style="text-align: left;">117 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">eo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">es</td>
+<td style="text-align: left;">20 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">et</td>
+<td style="text-align: left;">5 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">eu</td>
+<td style="text-align: left;">982 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">fa</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">fi</td>
+<td style="text-align: left;">9 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">fr</td>
+<td style="text-align: left;">60 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">fy</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ga</td>
+<td style="text-align: left;">669 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">gd</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">gl</td>
+<td style="text-align: left;">36 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">gn</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">gom</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">gsw</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">gu</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ha</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">he</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">hi</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">hr</td>
+<td style="text-align: left;">8 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">hsb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ht</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">hu</td>
+<td style="text-align: left;">12 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">hy</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ia</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">id</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ie</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ig</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ilo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">io</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">is</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">it</td>
+<td style="text-align: left;">14 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ja</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">jbo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">jv</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ka</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">kk</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">km</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">kn</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ko</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">krc</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ku</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">kv</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">kw</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ky</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">la</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">lb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">lez</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">li</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">lmo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">lo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">lt</td>
+<td style="text-align: left;">5 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">lv</td>
+<td style="text-align: left;">4 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">mai</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mg</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mhr</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">min</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mk</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ml</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mn</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mr</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mrj</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ms</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mt</td>
+<td style="text-align: left;">4 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">multi</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mwl</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">my</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">mzn</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">nah</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">nds</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ne</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">new</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">nl</td>
+<td style="text-align: left;">26 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">nn</td>
+<td style="text-align: left;">301 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">no</td>
+<td style="text-align: left;">5 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ny</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">oc</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">om</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">or</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">os</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">pa</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">pl</td>
+<td style="text-align: left;">25 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">pms</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">pnb</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ps</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">pt</td>
+<td style="text-align: left;">24 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">qu</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ro</td>
+<td style="text-align: left;">9 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ru</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">rw</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sa</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sah</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sd</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sh</td>
+<td style="text-align: left;">58 k</td>
+</tr>
+<tr>
+<td style="text-align: left;">si</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sk</td>
+<td style="text-align: left;">18 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">sl</td>
+<td style="text-align: left;">9 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">sn</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">so</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sq</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sr</td>
+<td style="text-align: left;">3 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">st</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">su</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sv</td>
+<td style="text-align: left;">13 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">sw</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ta</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">te</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">tg</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">th</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ti</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">tk</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">tl</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">tr</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">tt</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ug</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">uk</td>
+<td style="text-align: left;">11 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ur</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">uz</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">vi</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">vo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">wa</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">war</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">wuu</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">x-eml</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">xal</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">xh</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">xmf</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">yi</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">yo</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">zh</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">zu</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="data-sources">Data sources</h2>
+<p><img alt="Tokens by source" src="tokens_by_source.png" /></p>
+<table>
+<thead>
+<tr>
+<th style="text-align: left;">source_id</th>
+<th style="text-align: left;">reported_tokens</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align: left;">curlicat</td>
+<td style="text-align: left;">410 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">macocu</td>
+<td style="text-align: left;">23 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">redpajama</td>
+<td style="text-align: left;">46 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">wura</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">wikihow</td>
+<td style="text-align: left;">2 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">pes2o</td>
+<td style="text-align: left;">42 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">proof_pile</td>
+<td style="text-align: left;">8 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">pile_of_law</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">math_amps</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">edgarcorpus</td>
+<td style="text-align: left;">7 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">bulgarian_news</td>
+<td style="text-align: left;">283 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">bulnc</td>
+<td style="text-align: left;">567 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">openlegaldata</td>
+<td style="text-align: left;">10 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">dewac</td>
+<td style="text-align: left;">2 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ga_bilingual_legistation</td>
+<td style="text-align: left;">4 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">ga_universal_dependencies</td>
+<td style="text-align: left;">3 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">hrwac</td>
+<td style="text-align: left;">1 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">styria_news</td>
+<td style="text-align: left;">409 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">croatian_news_engri</td>
+<td style="text-align: left;">695 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">itwac</td>
+<td style="text-align: left;">2 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">korpus_malti</td>
+<td style="text-align: left;">366 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">sonar</td>
+<td style="text-align: left;">500 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">cc_gigafida</td>
+<td style="text-align: left;">127 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">academic_slovene_kas</td>
+<td style="text-align: left;">1 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">slwac_web</td>
+<td style="text-align: left;">1 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">sk_court_decisions</td>
+<td style="text-align: left;">11 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">sk_laws</td>
+<td style="text-align: left;">45 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">syn_v9</td>
+<td style="text-align: left;">5 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">cs_en_parallel</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">danish_gigaword</td>
+<td style="text-align: left;">1 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">danewsroom</td>
+<td style="text-align: left;">472 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">dk_clarin</td>
+<td style="text-align: left;">441 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">cabernet</td>
+<td style="text-align: left;">712 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">norwegian_cc</td>
+<td style="text-align: left;">5 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">pl_nkjp</td>
+<td style="text-align: left;">1 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">pl_parliamentary_corpus</td>
+<td style="text-align: left;">671 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">parlamento_pt</td>
+<td style="text-align: left;">819 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">brwac</td>
+<td style="text-align: left;">3 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">seimas_lt_en</td>
+<td style="text-align: left;">48 k</td>
+</tr>
+<tr>
+<td style="text-align: left;">state_related_latvian_web</td>
+<td style="text-align: left;">1 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">greek_legal_code</td>
+<td style="text-align: left;">45 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">greek_web_corpus</td>
+<td style="text-align: left;">3 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">estonian_reference_corpus</td>
+<td style="text-align: left;">175 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">enc2021</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">ekspress</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">euscrawl</td>
+<td style="text-align: left;">846 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">spanish_legal</td>
+<td style="text-align: left;">3 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">ylenews</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">sv_gigaword</td>
+<td style="text-align: left;">1 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">srpkor</td>
+<td style="text-align: left;">N/A</td>
+</tr>
+<tr>
+<td style="text-align: left;">marcell_legislative_subcorpus_v2</td>
+<td style="text-align: left;">31 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">uk_laws</td>
+<td style="text-align: left;">579 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">eurlex</td>
+<td style="text-align: left;">121 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">legal_mc4</td>
+<td style="text-align: left;">29 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">wiki</td>
+<td style="text-align: left;">12 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">wikibooks</td>
+<td style="text-align: left;">353 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">wikiquote</td>
+<td style="text-align: left;">268 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">wikinews</td>
+<td style="text-align: left;">79 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">wikisource</td>
+<td style="text-align: left;">2 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">wikivoyage</td>
+<td style="text-align: left;">132 M</td>
+</tr>
+<tr>
+<td style="text-align: left;">colossal_oscar</td>
+<td style="text-align: left;">154 B</td>
+</tr>
+<tr>
+<td style="text-align: left;">starcoder</td>
+<td style="text-align: left;">250 B</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_af/index.html b/datasets/language_af/index.html
new file mode 100644
index 0000000..f1425ed
--- /dev/null
+++ b/datasets/language_af/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_af/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Afrikaans Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#afrikaans-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Afrikaans Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-af-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [af; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-afrikaans" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Afrikaans]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="afrikaans-datasets">Afrikaans Datasets</h1>
+<p>There are 13 datasets in total for the Afrikaans language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-af-2015-14">Colossal OSCAR 1 [af; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2016-40">Colossal OSCAR 1 [af; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2017-43">Colossal OSCAR 1 [af; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2018-47">Colossal OSCAR 1 [af; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2019-22">Colossal OSCAR 1 [af; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2020-24">Colossal OSCAR 1 [af; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2020-45">Colossal OSCAR 1 [af; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2021-49">Colossal OSCAR 1 [af; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2022-27">Colossal OSCAR 1 [af; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2022-49">Colossal OSCAR 1 [af; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2023-14">Colossal OSCAR 1 [af; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-af-2023-23">Colossal OSCAR 1 [af; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [af; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-afrikaans">WURA [Afrikaans]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_af</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Afrikaans]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+</tr>
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_am/index.html b/datasets/language_am/index.html
new file mode 100644
index 0000000..45b2343
--- /dev/null
+++ b/datasets/language_am/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_am/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Amharic Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#amharic-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Amharic Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-am-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [am; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-amharic" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Amharic]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="amharic-datasets">Amharic Datasets</h1>
+<p>There are 13 datasets in total for the Amharic language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-am-2015-14">Colossal OSCAR 1 [am; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2016-40">Colossal OSCAR 1 [am; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2017-43">Colossal OSCAR 1 [am; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2018-47">Colossal OSCAR 1 [am; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2019-22">Colossal OSCAR 1 [am; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2020-24">Colossal OSCAR 1 [am; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2020-45">Colossal OSCAR 1 [am; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2021-49">Colossal OSCAR 1 [am; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2022-27">Colossal OSCAR 1 [am; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2022-49">Colossal OSCAR 1 [am; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2023-14">Colossal OSCAR 1 [am; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-am-2023-23">Colossal OSCAR 1 [am; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [am; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-amharic">WURA [Amharic]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_am</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Amharic]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+</tr>
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_an/index.html b/datasets/language_an/index.html
new file mode 100644
index 0000000..90e2d23
--- /dev/null
+++ b/datasets/language_an/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_an/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Aragonese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#aragonese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Aragonese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-an-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [an; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="aragonese-datasets">Aragonese Datasets</h1>
+<p>There are 12 datasets in total for the Aragonese language (total tokens: N/A).</p>
+<h2 id="colossal-oscar-1-an-2015-14">Colossal OSCAR 1 [an; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2016-40">Colossal OSCAR 1 [an; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2017-43">Colossal OSCAR 1 [an; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2018-47">Colossal OSCAR 1 [an; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2019-22">Colossal OSCAR 1 [an; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2020-24">Colossal OSCAR 1 [an; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2020-45">Colossal OSCAR 1 [an; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2021-49">Colossal OSCAR 1 [an; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2022-27">Colossal OSCAR 1 [an; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2022-49">Colossal OSCAR 1 [an; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2023-14">Colossal OSCAR 1 [an; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-an-2023-23">Colossal OSCAR 1 [an; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_an</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [an; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ar/index.html b/datasets/language_ar/index.html
new file mode 100644
index 0000000..d2e44b2
--- /dev/null
+++ b/datasets/language_ar/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ar/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Arabic Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#arabic-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Arabic Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ar-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ar; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-egyptian-arabic" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Egyptian Arabic]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="arabic-datasets">Arabic Datasets</h1>
+<p>There are 13 datasets in total for the Arabic language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-ar-2015-14">Colossal OSCAR 1 [ar; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2016-40">Colossal OSCAR 1 [ar; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2017-43">Colossal OSCAR 1 [ar; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2018-47">Colossal OSCAR 1 [ar; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2019-22">Colossal OSCAR 1 [ar; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2020-24">Colossal OSCAR 1 [ar; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2020-45">Colossal OSCAR 1 [ar; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2021-49">Colossal OSCAR 1 [ar; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2022-27">Colossal OSCAR 1 [ar; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2022-49">Colossal OSCAR 1 [ar; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2023-14">Colossal OSCAR 1 [ar; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ar-2023-23">Colossal OSCAR 1 [ar; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ar</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ar; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-egyptian-arabic">WURA [Egyptian Arabic]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Egyptian Arabic]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>
+Wura is large-scale pretraining data
+for 20 languages popularly
+spoken in Africa.
+</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_arz/index.html b/datasets/language_arz/index.html
new file mode 100644
index 0000000..36b1b02
--- /dev/null
+++ b/datasets/language_arz/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_arz/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Arz Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#arz-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Arz Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-arz-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [arz; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="arz-datasets">Arz Datasets</h1>
+<p>There are 12 datasets in total in the Arz language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-arz-2015-14">Colossal OSCAR 1 [arz; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2016-40">Colossal OSCAR 1 [arz; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2017-43">Colossal OSCAR 1 [arz; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2018-47">Colossal OSCAR 1 [arz; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2019-22">Colossal OSCAR 1 [arz; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2020-24">Colossal OSCAR 1 [arz; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2020-45">Colossal OSCAR 1 [arz; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2021-49">Colossal OSCAR 1 [arz; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2022-27">Colossal OSCAR 1 [arz; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2022-49">Colossal OSCAR 1 [arz; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2023-14">Colossal OSCAR 1 [arz; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-arz-2023-23">Colossal OSCAR 1 [arz; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_arz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [arz; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_as/index.html b/datasets/language_as/index.html
new file mode 100644
index 0000000..8b26901
--- /dev/null
+++ b/datasets/language_as/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_as/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Assamese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#assamese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Assamese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-as-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [as; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="assamese-datasets">Assamese Datasets</h1>
+<p>There are 12 datasets in total for the Assamese language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-as-2015-14">Colossal OSCAR 1 [as; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2016-40">Colossal OSCAR 1 [as; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2017-43">Colossal OSCAR 1 [as; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2018-47">Colossal OSCAR 1 [as; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2019-22">Colossal OSCAR 1 [as; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2020-24">Colossal OSCAR 1 [as; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2020-45">Colossal OSCAR 1 [as; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2021-49">Colossal OSCAR 1 [as; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2022-27">Colossal OSCAR 1 [as; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2022-49">Colossal OSCAR 1 [as; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2023-14">Colossal OSCAR 1 [as; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-as-2023-23">Colossal OSCAR 1 [as; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_as</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [as; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ast/index.html b/datasets/language_ast/index.html
new file mode 100644
index 0000000..3246e27
--- /dev/null
+++ b/datasets/language_ast/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ast/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ast Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ast-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ast Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ast-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ast; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ast-datasets">Ast Datasets</h1>
+<p>In total, there are 12 datasets in the Ast language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ast-2015-14">Colossal OSCAR 1 [ast; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2016-40">Colossal OSCAR 1 [ast; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2017-43">Colossal OSCAR 1 [ast; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2018-47">Colossal OSCAR 1 [ast; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2019-22">Colossal OSCAR 1 [ast; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2020-24">Colossal OSCAR 1 [ast; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2020-45">Colossal OSCAR 1 [ast; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2021-49">Colossal OSCAR 1 [ast; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2022-27">Colossal OSCAR 1 [ast; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2022-49">Colossal OSCAR 1 [ast; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2023-14">Colossal OSCAR 1 [ast; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ast-2023-23">Colossal OSCAR 1 [ast; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ast</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ast; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_av/index.html b/datasets/language_av/index.html
new file mode 100644
index 0000000..601ecbc
--- /dev/null
+++ b/datasets/language_av/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_av/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Avaric Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#avaric-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Avaric Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-av-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [av; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="avaric-datasets">Avaric Datasets</h1>
+<p>There are 12 datasets in total for the Avaric language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-av-2015-14">Colossal OSCAR 1 [av; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2016-40">Colossal OSCAR 1 [av; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2017-43">Colossal OSCAR 1 [av; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2018-47">Colossal OSCAR 1 [av; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2019-22">Colossal OSCAR 1 [av; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2020-24">Colossal OSCAR 1 [av; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2020-45">Colossal OSCAR 1 [av; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2021-49">Colossal OSCAR 1 [av; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2022-27">Colossal OSCAR 1 [av; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2022-49">Colossal OSCAR 1 [av; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2023-14">Colossal OSCAR 1 [av; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-av-2023-23">Colossal OSCAR 1 [av; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_av</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [av; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_az/index.html b/datasets/language_az/index.html
new file mode 100644
index 0000000..17a4025
--- /dev/null
+++ b/datasets/language_az/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_az/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Azerbaijani Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#azerbaijani-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Azerbaijani Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-az-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [az; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="azerbaijani-datasets">Azerbaijani Datasets</h1>
+<p>There are 12 datasets in total for the Azerbaijani language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-az-2015-14">Colossal OSCAR 1 [az; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2016-40">Colossal OSCAR 1 [az; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2017-43">Colossal OSCAR 1 [az; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2018-47">Colossal OSCAR 1 [az; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2019-22">Colossal OSCAR 1 [az; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2020-24">Colossal OSCAR 1 [az; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2020-45">Colossal OSCAR 1 [az; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2021-49">Colossal OSCAR 1 [az; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2022-27">Colossal OSCAR 1 [az; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2022-49">Colossal OSCAR 1 [az; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2023-14">Colossal OSCAR 1 [az; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-az-2023-23">Colossal OSCAR 1 [az; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_az</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [az; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_azb/index.html b/datasets/language_azb/index.html
new file mode 100644
index 0000000..8c1255c
--- /dev/null
+++ b/datasets/language_azb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_azb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Azb Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#azb-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Azb Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-azb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [azb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="azb-datasets">Azb Datasets</h1>
+<p>There are 12 datasets in total with N/A tokens in the Azb language.</p>
+<h2 id="colossal-oscar-1-azb-2015-14">Colossal OSCAR 1 [azb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2016-40">Colossal OSCAR 1 [azb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2017-43">Colossal OSCAR 1 [azb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2018-47">Colossal OSCAR 1 [azb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2019-22">Colossal OSCAR 1 [azb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2020-24">Colossal OSCAR 1 [azb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2020-45">Colossal OSCAR 1 [azb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2021-49">Colossal OSCAR 1 [azb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2022-27">Colossal OSCAR 1 [azb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2022-49">Colossal OSCAR 1 [azb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2023-14">Colossal OSCAR 1 [azb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-azb-2023-23">Colossal OSCAR 1 [azb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_azb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [azb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ba/index.html b/datasets/language_ba/index.html
new file mode 100644
index 0000000..1ba65a4
--- /dev/null
+++ b/datasets/language_ba/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ba/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bashkir Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bashkir-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bashkir Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ba-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ba; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bashkir-datasets">Bashkir Datasets</h1>
+<p>There are 12 datasets in total for the Bashkir language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-ba-2015-14">Colossal OSCAR 1 [ba; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2016-40">Colossal OSCAR 1 [ba; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2017-43">Colossal OSCAR 1 [ba; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2018-47">Colossal OSCAR 1 [ba; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2019-22">Colossal OSCAR 1 [ba; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2020-24">Colossal OSCAR 1 [ba; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2020-45">Colossal OSCAR 1 [ba; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2021-49">Colossal OSCAR 1 [ba; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2022-27">Colossal OSCAR 1 [ba; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2022-49">Colossal OSCAR 1 [ba; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2023-14">Colossal OSCAR 1 [ba; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ba-2023-23">Colossal OSCAR 1 [ba; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ba</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ba; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_be/index.html b/datasets/language_be/index.html
new file mode 100644
index 0000000..0a23c38
--- /dev/null
+++ b/datasets/language_be/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_be/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Belarusian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#belarusian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Belarusian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-be-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [be; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="belarusian-datasets">Belarusian Datasets</h1>
+<p>There are 12 datasets in total for the Belarusian language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-be-2015-14">Colossal OSCAR 1 [be; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2016-40">Colossal OSCAR 1 [be; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2017-43">Colossal OSCAR 1 [be; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2018-47">Colossal OSCAR 1 [be; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2019-22">Colossal OSCAR 1 [be; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2020-24">Colossal OSCAR 1 [be; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2020-45">Colossal OSCAR 1 [be; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2021-49">Colossal OSCAR 1 [be; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2022-27">Colossal OSCAR 1 [be; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2022-49">Colossal OSCAR 1 [be; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2023-14">Colossal OSCAR 1 [be; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-be-2023-23">Colossal OSCAR 1 [be; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_be</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [be; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bg/index.html b/datasets/language_bg/index.html
new file mode 100644
index 0000000..8b363a3
--- /dev/null
+++ b/datasets/language_bg/index.html
@@ -0,0 +1,1633 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bg/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bulgarian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bulgarian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bulgarian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#bulgarian-national-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      Bulgarian National Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-bulgarian" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Bulgarian]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bg-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bg; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#crawl-of-bulgarian-news-websites" class="md-nav__link">
+    <span class="md-ellipsis">
+      Crawl of Bulgarian news websites
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [bg]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [bg]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-bulgarian-20" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Bulgarian 2.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [bg]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [bg]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [bg]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [bg]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-bg" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [bg]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bulgarian-datasets">Bulgarian Datasets</h1>
+<p>In total, there are 23 datasets with 13 B tokens in the Bulgarian language.</p>
+<h2 id="bulgarian-national-corpus">Bulgarian National Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>bulnc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Bulgarian National Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The Bulgarian National Corpus contains a wide range of texts in various sizes, media types (written and spoken), styles, periods (synchronic and diachronic), and licenses. Each text in the collection is supplied with metadata. The Bulgarian National Corpus was first compiled using the Bulgarian Lexicographic Archive and the Text Archive of Written Bulgarian, which account for 55.95% of the corpus. Later, the EMEA corpus (medical administrative texts) and the OpenSubtitles corpus (film subtitles) were added, accounting for 1.27% and 8.61% of the BulNC, respectively. The remaining texts were crawled automatically and include a large number of administrative texts, news from monolingual and multilingual sources, scientific texts, and popular science. The BulNC is not fully downloadable due to the inclusion of copyrighted material. We've provided a link to a password-protected archive for evaluation.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[None]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>research only (commercial use: None, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>567 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="curlicat-corpus-bulgarian">CURLICAT Corpus [Bulgarian]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Bulgarian]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://elrc-share.eu/repository/browse/curlicat-bulgarian-corpus/fed6af2a590311ed9c1a00155d0267062ed273d01d2343f1b78d08d4d481679d/]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA-4.0 (commercial use: None, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>35 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2015-14">Colossal OSCAR 1 [bg; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2016-40">Colossal OSCAR 1 [bg; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2017-43">Colossal OSCAR 1 [bg; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2018-47">Colossal OSCAR 1 [bg; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2019-22">Colossal OSCAR 1 [bg; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2020-24">Colossal OSCAR 1 [bg; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2020-45">Colossal OSCAR 1 [bg; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2021-49">Colossal OSCAR 1 [bg; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2022-27">Colossal OSCAR 1 [bg; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2022-49">Colossal OSCAR 1 [bg; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2023-14">Colossal OSCAR 1 [bg; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bg-2023-23">Colossal OSCAR 1 [bg; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bg; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="crawl-of-bulgarian-news-websites">Crawl of Bulgarian news websites</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>bulgarian_news</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Crawl of Bulgarian news websites</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The collection was gathered by crawling Bulgarian-language websites. Text samples are in JSON format. We can provide raw texts.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>N/A</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>research only (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>283 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-bg">EurlexResources [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-bg">LegalMC4 [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-bulgarian-20">MaCoCu web corpus [Bulgarian 2.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Bulgarian 2.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See <a href="https://macocu.eu/">https://macocu.eu/</a>.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1800">https://www.clarin.si/repository/xmlui/handle/11356/1800</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-bg">Wikibooks [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-bg">Wikinews [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-bg">Wikipedia [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>356 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-bg">Wikiquote [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>11 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-bg">Wikisource [bg]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_bg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [bg]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>25 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bh/index.html b/datasets/language_bh/index.html
new file mode 100644
index 0000000..66b0ac2
--- /dev/null
+++ b/datasets/language_bh/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bh/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bihari Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bihari-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bihari Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bh-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bh; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bihari-datasets">Bihari Datasets</h1>
+<p>There are 12 datasets in total for the Bihari language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-bh-2015-14">Colossal OSCAR 1 [bh; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2016-40">Colossal OSCAR 1 [bh; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2017-43">Colossal OSCAR 1 [bh; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2018-47">Colossal OSCAR 1 [bh; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2019-22">Colossal OSCAR 1 [bh; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2020-24">Colossal OSCAR 1 [bh; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2020-45">Colossal OSCAR 1 [bh; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2021-49">Colossal OSCAR 1 [bh; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2022-27">Colossal OSCAR 1 [bh; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2022-49">Colossal OSCAR 1 [bh; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2023-14">Colossal OSCAR 1 [bh; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bh-2023-23">Colossal OSCAR 1 [bh; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bh; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bn/index.html b/datasets/language_bn/index.html
new file mode 100644
index 0000000..abb2afa
--- /dev/null
+++ b/datasets/language_bn/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bengali Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bengali-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bengali Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bn-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bn; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bengali-datasets">Bengali Datasets</h1>
+<p>There are 12 datasets in total for the Bengali language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-bn-2015-14">Colossal OSCAR 1 [bn; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2016-40">Colossal OSCAR 1 [bn; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2017-43">Colossal OSCAR 1 [bn; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2018-47">Colossal OSCAR 1 [bn; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2019-22">Colossal OSCAR 1 [bn; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2020-24">Colossal OSCAR 1 [bn; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2020-45">Colossal OSCAR 1 [bn; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2021-49">Colossal OSCAR 1 [bn; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2022-27">Colossal OSCAR 1 [bn; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2022-49">Colossal OSCAR 1 [bn; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2023-14">Colossal OSCAR 1 [bn; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bn-2023-23">Colossal OSCAR 1 [bn; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bn; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bo/index.html b/datasets/language_bo/index.html
new file mode 100644
index 0000000..c790db4
--- /dev/null
+++ b/datasets/language_bo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Tibetan Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#tibetan-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Tibetan Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="tibetan-datasets">Tibetan Datasets</h1>
+<p>There are 12 datasets in total for the Tibetan language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-bo-2015-14">Colossal OSCAR 1 [bo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2016-40">Colossal OSCAR 1 [bo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2017-43">Colossal OSCAR 1 [bo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2018-47">Colossal OSCAR 1 [bo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2019-22">Colossal OSCAR 1 [bo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2020-24">Colossal OSCAR 1 [bo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2020-45">Colossal OSCAR 1 [bo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2021-49">Colossal OSCAR 1 [bo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2022-27">Colossal OSCAR 1 [bo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2022-49">Colossal OSCAR 1 [bo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2023-14">Colossal OSCAR 1 [bo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bo-2023-23">Colossal OSCAR 1 [bo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bpy/index.html b/datasets/language_bpy/index.html
new file mode 100644
index 0000000..c486f23
--- /dev/null
+++ b/datasets/language_bpy/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bpy/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bpy Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bpy-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bpy Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bpy-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bpy; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bpy-datasets">Bpy Datasets</h1>
+<p>There are 12 datasets in total for the Bpy language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-bpy-2015-14">Colossal OSCAR 1 [bpy; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2016-40">Colossal OSCAR 1 [bpy; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2017-43">Colossal OSCAR 1 [bpy; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2018-47">Colossal OSCAR 1 [bpy; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2019-22">Colossal OSCAR 1 [bpy; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2020-24">Colossal OSCAR 1 [bpy; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2020-45">Colossal OSCAR 1 [bpy; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2021-49">Colossal OSCAR 1 [bpy; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2022-27">Colossal OSCAR 1 [bpy; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2022-49">Colossal OSCAR 1 [bpy; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2023-14">Colossal OSCAR 1 [bpy; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bpy-2023-23">Colossal OSCAR 1 [bpy; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bpy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bpy; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_br/index.html b/datasets/language_br/index.html
new file mode 100644
index 0000000..714c3a2
--- /dev/null
+++ b/datasets/language_br/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_br/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Breton Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#breton-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Breton Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-br-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [br; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="breton-datasets">Breton Datasets</h1>
+<p>There are 12 datasets in total for the Breton language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-br-2015-14">Colossal OSCAR 1 [br; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2016-40">Colossal OSCAR 1 [br; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2017-43">Colossal OSCAR 1 [br; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2018-47">Colossal OSCAR 1 [br; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2019-22">Colossal OSCAR 1 [br; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2020-24">Colossal OSCAR 1 [br; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2020-45">Colossal OSCAR 1 [br; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2021-49">Colossal OSCAR 1 [br; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2022-27">Colossal OSCAR 1 [br; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2022-49">Colossal OSCAR 1 [br; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2023-14">Colossal OSCAR 1 [br; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-br-2023-23">Colossal OSCAR 1 [br; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_br</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [br; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bs/index.html b/datasets/language_bs/index.html
new file mode 100644
index 0000000..657d77c
--- /dev/null
+++ b/datasets/language_bs/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bs/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bosnian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bosnian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bosnian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bs-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bs; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bosnian-datasets">Bosnian Datasets</h1>
+<p>There are 12 datasets in total for the Bosnian language; their combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-bs-2015-14">Colossal OSCAR 1 [bs; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2016-40">Colossal OSCAR 1 [bs; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2017-43">Colossal OSCAR 1 [bs; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2018-47">Colossal OSCAR 1 [bs; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2019-22">Colossal OSCAR 1 [bs; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2020-24">Colossal OSCAR 1 [bs; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2020-45">Colossal OSCAR 1 [bs; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2021-49">Colossal OSCAR 1 [bs; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2022-27">Colossal OSCAR 1 [bs; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2022-49">Colossal OSCAR 1 [bs; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2023-14">Colossal OSCAR 1 [bs; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bs-2023-23">Colossal OSCAR 1 [bs; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bs; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_bxr/index.html b/datasets/language_bxr/index.html
new file mode 100644
index 0000000..7f79340
--- /dev/null
+++ b/datasets/language_bxr/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_bxr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Bxr Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#bxr-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Bxr Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-bxr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [bxr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="bxr-datasets">Bxr Datasets</h1>
+<p>There are 12 datasets in total for the Bxr language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-bxr-2015-14">Colossal OSCAR 1 [bxr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2016-40">Colossal OSCAR 1 [bxr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2017-43">Colossal OSCAR 1 [bxr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2018-47">Colossal OSCAR 1 [bxr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2019-22">Colossal OSCAR 1 [bxr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2020-24">Colossal OSCAR 1 [bxr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2020-45">Colossal OSCAR 1 [bxr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2021-49">Colossal OSCAR 1 [bxr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2022-27">Colossal OSCAR 1 [bxr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2022-49">Colossal OSCAR 1 [bxr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2023-14">Colossal OSCAR 1 [bxr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-bxr-2023-23">Colossal OSCAR 1 [bxr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_bxr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [bxr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ca/index.html b/datasets/language_ca/index.html
new file mode 100644
index 0000000..0021f95
--- /dev/null
+++ b/datasets/language_ca/index.html
@@ -0,0 +1,1457 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ca/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Catalan Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#catalan-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Catalan Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ca-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ca; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-catalan-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Catalan 1.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-ca" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [ca]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-ca" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [ca]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-ca" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [ca]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-ca" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [ca]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-ca" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [ca]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-ca" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [ca]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="catalan-datasets">Catalan Datasets</h1>
+<p>There are 19 datasets in total, with 4 B tokens, in the Catalan language.</p>
+<h2 id="colossal-oscar-1-ca-2015-14">Colossal OSCAR 1 [ca; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2016-40">Colossal OSCAR 1 [ca; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2017-43">Colossal OSCAR 1 [ca; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2018-47">Colossal OSCAR 1 [ca; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2019-22">Colossal OSCAR 1 [ca; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2020-24">Colossal OSCAR 1 [ca; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2020-45">Colossal OSCAR 1 [ca; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2021-49">Colossal OSCAR 1 [ca; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2022-27">Colossal OSCAR 1 [ca; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2022-49">Colossal OSCAR 1 [ca; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2023-14">Colossal OSCAR 1 [ca; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ca-2023-23">Colossal OSCAR 1 [ca; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ca; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-catalan-10">MaCoCu web corpus [Catalan 1.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Catalan 1.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1837">https://www.clarin.si/repository/xmlui/handle/11356/1837</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-ca">Wikibooks [ca]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [ca]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-ca">Wikinews [ca]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [ca]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-ca">Wikipedia [ca]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [ca]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-ca">Wikiquote [ca]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [ca]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-ca">Wikisource [ca]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [ca]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-ca">Wikivoyage [ca]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_ca</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [ca]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ce/index.html b/datasets/language_ce/index.html
new file mode 100644
index 0000000..d7c8fe5
--- /dev/null
+++ b/datasets/language_ce/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ce/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Chechen Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#chechen-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Chechen Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ce-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ce; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="chechen-datasets">Chechen Datasets</h1>
+<p>There are 12 datasets in total for the Chechen language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-ce-2015-14">Colossal OSCAR 1 [ce; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2016-40">Colossal OSCAR 1 [ce; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2017-43">Colossal OSCAR 1 [ce; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2018-47">Colossal OSCAR 1 [ce; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2019-22">Colossal OSCAR 1 [ce; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2020-24">Colossal OSCAR 1 [ce; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2020-45">Colossal OSCAR 1 [ce; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2021-49">Colossal OSCAR 1 [ce; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2022-27">Colossal OSCAR 1 [ce; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2022-49">Colossal OSCAR 1 [ce; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2023-14">Colossal OSCAR 1 [ce; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ce-2023-23">Colossal OSCAR 1 [ce; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ce</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ce; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ceb/index.html b/datasets/language_ceb/index.html
new file mode 100644
index 0000000..5195c82
--- /dev/null
+++ b/datasets/language_ceb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ceb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ceb Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ceb-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ceb Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ceb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ceb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ceb-datasets">Ceb Datasets</h1>
+<p>There are 12 datasets in total for the Ceb language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-ceb-2015-14">Colossal OSCAR 1 [ceb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2016-40">Colossal OSCAR 1 [ceb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2017-43">Colossal OSCAR 1 [ceb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2018-47">Colossal OSCAR 1 [ceb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2019-22">Colossal OSCAR 1 [ceb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2020-24">Colossal OSCAR 1 [ceb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2020-45">Colossal OSCAR 1 [ceb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2021-49">Colossal OSCAR 1 [ceb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2022-27">Colossal OSCAR 1 [ceb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2022-49">Colossal OSCAR 1 [ceb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2023-14">Colossal OSCAR 1 [ceb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ceb-2023-23">Colossal OSCAR 1 [ceb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ceb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ceb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ckb/index.html b/datasets/language_ckb/index.html
new file mode 100644
index 0000000..79c32a7
--- /dev/null
+++ b/datasets/language_ckb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ckb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ckb Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ckb-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ckb Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ckb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ckb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ckb-datasets">Ckb Datasets</h1>
+<p>There are 12 datasets in total for the Ckb language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-ckb-2015-14">Colossal OSCAR 1 [ckb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2016-40">Colossal OSCAR 1 [ckb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2017-43">Colossal OSCAR 1 [ckb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2018-47">Colossal OSCAR 1 [ckb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2019-22">Colossal OSCAR 1 [ckb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2020-24">Colossal OSCAR 1 [ckb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2020-45">Colossal OSCAR 1 [ckb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2021-49">Colossal OSCAR 1 [ckb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2022-27">Colossal OSCAR 1 [ckb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2022-49">Colossal OSCAR 1 [ckb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2023-14">Colossal OSCAR 1 [ckb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ckb-2023-23">Colossal OSCAR 1 [ckb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ckb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ckb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_code/index.html b/datasets/language_code/index.html
new file mode 100644
index 0000000..2182a5a
--- /dev/null
+++ b/datasets/language_code/index.html
@@ -0,0 +1,4669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_code/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Code Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#code-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Code Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_1" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_2" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_3" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_4" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_5" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_6" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_7" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_8" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_9" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_10" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_11" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_12" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_13" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_15" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_16" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_17" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_18" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_19" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_20" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_21" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_25" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_26" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_28" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_29" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_30" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_31" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_32" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_33" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_34" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_35" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_36" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_37" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_38" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_39" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_41" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_42" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_44" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_46" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_48" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_50" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_51" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_52" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_53" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_54" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_55" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_56" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_57" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_58" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_59" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_60" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_61" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_62" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_63" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_64" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_65" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_66" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_67" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_68" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_69" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_70" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_71" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_72" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_73" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_74" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_75" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_76" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_77" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_78" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_79" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_80" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_81" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_82" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_83" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_84" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_85" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_86" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_87" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_88" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_89" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_90" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#starcoder_91" class="md-nav__link">
+    <span class="md-ellipsis">
+      Starcoder
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="code-datasets">Code Datasets</h1>
+<p>In total, there are 92 code datasets comprising 250 B tokens.</p>
+<h2 id="starcoder">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_emacs-lisp</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_1">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_visual-basic</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_2">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_racket</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_3">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_json</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_4">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_common-lisp</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_5">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_vhdl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_6">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_r</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_7">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_javascript</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_8">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_coffeescript</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_9">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_verilog</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_10">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_python</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_11">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_java-server-pages</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_12">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_cmake</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_13">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_typescript</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_14">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_protocol-buffer</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_15">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_java</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_16">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_clojure</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_17">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_thrift</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_18">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_prolog</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_19">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_isabelle</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_20">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_cpp</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_21">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_c-sharp</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_22">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_julia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_23">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_xslt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_24">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_elm</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_25">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_scala</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_26">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_literate-agda</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_27">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_elixir</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_28">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_sas</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_29">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_lean</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_30">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_dockerfile</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_31">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_zig</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_32">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_rust</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_33">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_kotlin</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_34">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_dart</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_35">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_yaml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_36">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_ruby</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_37">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_jupyter-structured-clean-dedup</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_38">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_cuda</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_39">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_yacc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_40">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_rmarkdown</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_41">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_jupyter-scripts-dedup-filtered</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_42">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_css</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_43">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_restructuredtext</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_44">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_tex</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_45">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_powershell</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_46">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_idris</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_47">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_applescript</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_48">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_sql</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_49">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_markdown</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_50">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_git-commits-cleaned</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_51">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_antlr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_52">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_sparql</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_53">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_maple</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_54">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_fortran</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_55">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_alloy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_56">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_solidity</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_57">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_makefile</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_58">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_f-sharp</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_59">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_agda</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_60">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_smalltalk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_61">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_lua</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_62">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_erlang</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_63">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_ada</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_64">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_shell</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_65">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_literate-haskell</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_66">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_github-issues-filtered-structured</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_67">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_mathematica</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_68">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_stan</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_69">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_assembly</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_70">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_c</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_71">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_tcsh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_72">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_php</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_73">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_html</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_74">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_bluespec</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_75">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_tcl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_76">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_perl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_77">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_haskell</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_78">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_batchfile</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_79">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_literate-coffeescript</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_80">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_systemverilog</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_81">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_groovy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_82">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_awk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_83">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_stata</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_84">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_ocaml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_85">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_go</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_86">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_augeas</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_87">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_standard-ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_88">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_matlab</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_89">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_glsl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_90">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_pascal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="starcoder_91">Starcoder</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>starcoder_scheme</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Starcoder</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/bigcode/starcoderdata">https://huggingface.co/datasets/bigcode/starcoderdata</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed permissive licenses (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_cs/index.html b/datasets/language_cs/index.html
new file mode 100644
index 0000000..c81e45d
--- /dev/null
+++ b/datasets/language_cs/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_cs/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Czech Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#czech-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Czech Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cs-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cs; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#czech-english-parallel-corpus-10-czeng-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      Czech-English Parallel Corpus 1.0 (CzEng 1.0)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [cs]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [cs]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#syn-v9-large-corpus-of-written-czech" class="md-nav__link">
+    <span class="md-ellipsis">
+      SYN v9: large corpus of written Czech
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [cs]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [cs]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [cs]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [cs]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-cs" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [cs]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="czech-datasets">Czech Datasets</h1>
+<p>In total, there are 21 datasets with 21 B tokens in the Czech language.</p>
+<h2 id="colossal-oscar-1-cs-2015-14">Colossal OSCAR 1 [cs; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2016-40">Colossal OSCAR 1 [cs; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2017-43">Colossal OSCAR 1 [cs; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2018-47">Colossal OSCAR 1 [cs; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2019-22">Colossal OSCAR 1 [cs; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2020-24">Colossal OSCAR 1 [cs; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2020-45">Colossal OSCAR 1 [cs; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2021-49">Colossal OSCAR 1 [cs; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2022-27">Colossal OSCAR 1 [cs; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2022-49">Colossal OSCAR 1 [cs; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2023-14">Colossal OSCAR 1 [cs; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cs-2023-23">Colossal OSCAR 1 [cs; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cs; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>10 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="czech-english-parallel-corpus-10-czeng-10">Czech-English Parallel Corpus 1.0 (CzEng 1.0)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>cs_en_parallel</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Czech-English Parallel Corpus 1.0 (CzEng 1.0)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>CzEng 1.0 is the fourth release of a sentence-parallel Czech-English corpus compiled at the Institute of Formal and Applied Linguistics (ÚFAL) freely available for non-commercial research purposes. CzEng 1.0 contains 15 million parallel sentences (233 million English and 206 million Czech tokens) from seven different types of sources automatically annotated at surface and deep (a- and t-) layers of syntactic representation.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[http://hdl.handle.net/11234/1-1458]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-cs">EurlexResources [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-cs">LegalMC4 [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="syn-v9-large-corpus-of-written-czech">SYN v9: large corpus of written Czech</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>syn_v9</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SYN v9: large corpus of written Czech</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Corpus of contemporary written (printed) Czech sized 4.7 GW (i.e. 5.7 billion tokens). It covers mostly the 1990-2019 period and features rich metadata including detailed bibliographical information, text-type classification etc. SYN v9 contains a wide variety of text types (fiction, non-fiction, newspapers), but the newspapers prevail noticeably.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4635]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Academic Use - Czech National Corpus (Shuffled Corpus Data) (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-cs">Wikibooks [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-cs">Wikinews [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikinews.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-cs">Wikipedia [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>273 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-cs">Wikiquote [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-cs">Wikisource [cs]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_cs</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [cs]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>76 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_cv/index.html b/datasets/language_cv/index.html
new file mode 100644
index 0000000..c2b24c9
--- /dev/null
+++ b/datasets/language_cv/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_cv/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Chuvash Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#chuvash-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Chuvash Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cv-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cv; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="chuvash-datasets">Chuvash Datasets</h1>
+<p>There are 12 datasets in total for the Chuvash language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-cv-2015-14">Colossal OSCAR 1 [cv; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2016-40">Colossal OSCAR 1 [cv; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2017-43">Colossal OSCAR 1 [cv; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2018-47">Colossal OSCAR 1 [cv; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2019-22">Colossal OSCAR 1 [cv; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2020-24">Colossal OSCAR 1 [cv; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2020-45">Colossal OSCAR 1 [cv; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2021-49">Colossal OSCAR 1 [cv; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2022-27">Colossal OSCAR 1 [cv; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2022-49">Colossal OSCAR 1 [cv; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2023-14">Colossal OSCAR 1 [cv; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cv-2023-23">Colossal OSCAR 1 [cv; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_cv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cv; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_cy/index.html b/datasets/language_cy/index.html
new file mode 100644
index 0000000..a81eedc
--- /dev/null
+++ b/datasets/language_cy/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_cy/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Welsh Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#welsh-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Welsh Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-cy-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [cy; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="welsh-datasets">Welsh Datasets</h1>
+<p>There are 12 datasets in total for the Welsh language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-cy-2015-14">Colossal OSCAR 1 [cy; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2016-40">Colossal OSCAR 1 [cy; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2017-43">Colossal OSCAR 1 [cy; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2018-47">Colossal OSCAR 1 [cy; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2019-22">Colossal OSCAR 1 [cy; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2020-24">Colossal OSCAR 1 [cy; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2020-45">Colossal OSCAR 1 [cy; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2021-49">Colossal OSCAR 1 [cy; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2022-27">Colossal OSCAR 1 [cy; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2022-49">Colossal OSCAR 1 [cy; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2023-14">Colossal OSCAR 1 [cy; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-cy-2023-23">Colossal OSCAR 1 [cy; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_cy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [cy; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_da/index.html b/datasets/language_da/index.html
new file mode 100644
index 0000000..d5419a1
--- /dev/null
+++ b/datasets/language_da/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_da/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Danish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#danish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Danish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-da-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [da; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#dk-clarin-reference-corpus-of-general-danish" class="md-nav__link">
+    <span class="md-ellipsis">
+      DK-CLARIN Reference Corpus of General Danish
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#danewsroom" class="md-nav__link">
+    <span class="md-ellipsis">
+      DaNewsroom
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#danish-gigaword" class="md-nav__link">
+    <span class="md-ellipsis">
+      Danish GigaWord
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-da" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [da]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-da" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [da]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-da" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [da]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-da" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [da]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-da" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [da]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-da" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [da]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="danish-datasets">Danish Datasets</h1>
+<p>There are 21 datasets in total, with 11 B tokens, in the Danish language.</p>
+<h2 id="colossal-oscar-1-da-2015-14">Colossal OSCAR 1 [da; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2016-40">Colossal OSCAR 1 [da; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2017-43">Colossal OSCAR 1 [da; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2018-47">Colossal OSCAR 1 [da; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2019-22">Colossal OSCAR 1 [da; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2020-24">Colossal OSCAR 1 [da; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2020-45">Colossal OSCAR 1 [da; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2021-49">Colossal OSCAR 1 [da; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2022-27">Colossal OSCAR 1 [da; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2022-49">Colossal OSCAR 1 [da; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2023-14">Colossal OSCAR 1 [da; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-da-2023-23">Colossal OSCAR 1 [da; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [da; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="dk-clarin-reference-corpus-of-general-danish">DK-CLARIN Reference Corpus of General Danish</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>dk_clarin</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>DK-CLARIN Reference Corpus of General Danish</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Reference Corpus of General Danish</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://korpus.dsl.dk/clarin/">https://korpus.dsl.dk/clarin/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Academic Use; CLARIN-ACA-NC (commercial use: False, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>441 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="danewsroom">DaNewsroom</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>danewsroom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>DaNewsroom</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Large-scale Danish Summarisation Dataset</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://github.com/danielvarab/da-newsroom">https://github.com/danielvarab/da-newsroom</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>research-only (unknown license) (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>472 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="danish-gigaword">Danish GigaWord</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>danish_gigaword</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Danish GigaWord</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A billion-word corpus of Danish text. Split into many sections, and covering many dimensions of variation (spoken/written, formal/informal, modern/old, rigsdansk/dialect, and so on). The license is CC-BY 4.0, Creative Commons with Attribution. Owners: ITU; Leon Derczynski, Manuel R. Ciosici.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://sprogteknologi.dk/dataset/danish-gigaword">https://sprogteknologi.dk/dataset/danish-gigaword</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY 4.0, Creative Commons with Attribution (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-da">EurlexResources [da]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [da]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-da">LegalMC4 [da]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [da]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>10 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-da">Wikibooks [da]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [da]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-da">Wikipedia [da]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [da]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>66 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-da">Wikiquote [da]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [da]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>303 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-da">Wikisource [da]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_da</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [da]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_de/index.html b/datasets/language_de/index.html
new file mode 100644
index 0000000..72f0a52
--- /dev/null
+++ b/datasets/language_de/index.html
@@ -0,0 +1,1589 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_de/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>German Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#german-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              German Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-de-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [de; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#dewac" class="md-nav__link">
+    <span class="md-ellipsis">
+      DeWaC
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#open-legal-data-german-court-decisions-and-laws" class="md-nav__link">
+    <span class="md-ellipsis">
+      Open Legal Data - German court decisions and laws
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [de]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-de" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [de]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="german-datasets">German Datasets</h1>
+<p>In total, there are 22 datasets with 26 B tokens in the German language.</p>
+<h2 id="colossal-oscar-1-de-2015-14">Colossal OSCAR 1 [de; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2016-40">Colossal OSCAR 1 [de; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2017-43">Colossal OSCAR 1 [de; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2018-47">Colossal OSCAR 1 [de; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2019-22">Colossal OSCAR 1 [de; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2020-24">Colossal OSCAR 1 [de; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2020-45">Colossal OSCAR 1 [de; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2021-49">Colossal OSCAR 1 [de; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2022-27">Colossal OSCAR 1 [de; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2022-49">Colossal OSCAR 1 [de; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2023-14">Colossal OSCAR 1 [de; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-de-2023-23">Colossal OSCAR 1 [de; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [de; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="dewac">DeWaC</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>dewac</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>DeWaC</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>DeWaC is a 1.7 billion word corpus constructed from the Web, limiting the crawl to the .de domain and using medium-frequency words from the Süddeutsche Zeitung corpus and basic German vocabulary lists as seeds.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://docs.sslmit.unibo.it/doku.php?id=corpora:dewac">https://docs.sslmit.unibo.it/doku.php?id=corpora:dewac</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown license; likely fair-use / research-only (commercial use: None, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-de">EurlexResources [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-de">LegalMC4 [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="open-legal-data-german-court-decisions-and-laws">Open Legal Data - German court decisions and laws</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>openlegaldata</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Open Legal Data - German court decisions and laws</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>OPENLEGALDATA.IO is a free and open platform that makes legal documents and information accessible to the public.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://openlegaldata.io/">https://openlegaldata.io/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>public domain (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>10 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-de">Wikibooks [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>50 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-de">Wikinews [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>9 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-de">Wikipedia [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-de">Wikiquote [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-de">Wikisource [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>156 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-de">Wikivoyage [de]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_de</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [de]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>29 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_dsb/index.html b/datasets/language_dsb/index.html
new file mode 100644
index 0000000..fdae425
--- /dev/null
+++ b/datasets/language_dsb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_dsb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Dsb Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#dsb-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Dsb Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dsb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dsb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="dsb-datasets">Dsb Datasets</h1>
+<p>There are 12 datasets in total for the Dsb language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-dsb-2015-14">Colossal OSCAR 1 [dsb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2016-40">Colossal OSCAR 1 [dsb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2017-43">Colossal OSCAR 1 [dsb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2018-47">Colossal OSCAR 1 [dsb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2019-22">Colossal OSCAR 1 [dsb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2020-24">Colossal OSCAR 1 [dsb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2020-45">Colossal OSCAR 1 [dsb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2021-49">Colossal OSCAR 1 [dsb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2022-27">Colossal OSCAR 1 [dsb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2022-49">Colossal OSCAR 1 [dsb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2023-14">Colossal OSCAR 1 [dsb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dsb-2023-23">Colossal OSCAR 1 [dsb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_dsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dsb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_dv/index.html b/datasets/language_dv/index.html
new file mode 100644
index 0000000..87e5479
--- /dev/null
+++ b/datasets/language_dv/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_dv/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Dhivehi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#dhivehi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Dhivehi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-dv-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [dv; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="dhivehi-datasets">Dhivehi Datasets</h1>
+<p>There are 12 Dhivehi-language datasets in total; their combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-dv-2015-14">Colossal OSCAR 1 [dv; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2016-40">Colossal OSCAR 1 [dv; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2017-43">Colossal OSCAR 1 [dv; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2018-47">Colossal OSCAR 1 [dv; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2019-22">Colossal OSCAR 1 [dv; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2020-24">Colossal OSCAR 1 [dv; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2020-45">Colossal OSCAR 1 [dv; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2021-49">Colossal OSCAR 1 [dv; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2022-27">Colossal OSCAR 1 [dv; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2022-49">Colossal OSCAR 1 [dv; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2023-14">Colossal OSCAR 1 [dv; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-dv-2023-23">Colossal OSCAR 1 [dv; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_dv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [dv; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_el/index.html b/datasets/language_el/index.html
new file mode 100644
index 0000000..a7a84ca
--- /dev/null
+++ b/datasets/language_el/index.html
@@ -0,0 +1,1633 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_el/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Greek Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#greek-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Greek Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-el-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [el; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#greek-legal-code" class="md-nav__link">
+    <span class="md-ellipsis">
+      Greek Legal Code
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#greek-web-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      Greek Web Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-greek-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Greek 1.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [el]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-el" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [el]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="greek-datasets">Greek Datasets</h1>
+<p>In total, there are 23 datasets with 24 B tokens in the Greek language.</p>
+<h2 id="colossal-oscar-1-el-2015-14">Colossal OSCAR 1 [el; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2016-40">Colossal OSCAR 1 [el; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2017-43">Colossal OSCAR 1 [el; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2018-47">Colossal OSCAR 1 [el; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2019-22">Colossal OSCAR 1 [el; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2020-24">Colossal OSCAR 1 [el; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2020-45">Colossal OSCAR 1 [el; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2021-49">Colossal OSCAR 1 [el; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2022-27">Colossal OSCAR 1 [el; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2022-49">Colossal OSCAR 1 [el; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2023-14">Colossal OSCAR 1 [el; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-el-2023-23">Colossal OSCAR 1 [el; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [el; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-el">EurlexResources [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="greek-legal-code">Greek Legal Code</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>greek_legal_code</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Greek Legal Code</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Greek_Legal_Code (GLC) is a dataset consisting of approx. 47k legal resources from Greek legislation. The origin of GLC is “Permanent Greek Legislation Code - Raptarchis”, a collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/greek_legal_code">https://huggingface.co/datasets/greek_legal_code</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown; likely public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>45 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="greek-web-corpus">Greek Web Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>greek_web_corpus</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Greek Web Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A corpus of the Greek Web used for training <code>GreekBART: The First Pretrained Greek Sequence-to-Sequence Model</code></td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="http://nlp.polytechnique.fr/resources-greek">http://nlp.polytechnique.fr/resources-greek</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown; likely fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-el">LegalMC4 [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-greek-10">MaCoCu web corpus [Greek 1.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Greek 1.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1839">https://www.clarin.si/repository/xmlui/handle/11356/1839</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-el">Wikibooks [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>19 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-el">Wikinews [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-el">Wikipedia [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>584 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-el">Wikiquote [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-el">Wikisource [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>164 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-el">Wikivoyage [el]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_el</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [el]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_en/index.html b/datasets/language_en/index.html
new file mode 100644
index 0000000..6edcabc
--- /dev/null
+++ b/datasets/language_en/index.html
@@ -0,0 +1,2837 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_en/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>English Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#english-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              English Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#auxiliary-mathematics-problems-and-solutions-amps-dataset" class="md-nav__link">
+    <span class="md-ellipsis">
+      Auxiliary Mathematics Problems and Solutions (AMPS) dataset
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-en-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [en; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#edgarcorpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      EdgarCorpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-selected-subsets" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law (selected subsets)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-selected-subsets_1" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law (selected subsets)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-acus_reports" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [acus_reports]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-atticus_contracts" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [atticus_contracts]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-cc_casebooks" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [cc_casebooks]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-cfpb_creditcard_contracts" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [cfpb_creditcard_contracts]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-congressional_hearings" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [congressional_hearings]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-constitutions" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [constitutions]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-courtlistener_docket_entry_documents" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [courtlistener_docket_entry_documents]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-courtlistener_opinions" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [courtlistener_opinions]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-doj_guidance_documents" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [doj_guidance_documents]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-echr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [echr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-ed_policy_guidance" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [ed_policy_guidance]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-exam_outlines" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [exam_outlines]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-icj-pcij" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [icj-pcij]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-medicaid_policy_guidance" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [medicaid_policy_guidance]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-r_legaladvice" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [r_legaladvice]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-resource_contracts" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [resource_contracts]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-scotus_oral_arguments" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [scotus_oral_arguments]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-tos" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [tos]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-un_debates" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [un_debates]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pile-of-law-uspto_office_actions" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pile of Law [uspto_office_actions]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#redpajama-data-t1-selected-subsets" class="md-nav__link">
+    <span class="md-ellipsis">
+      RedPajama-Data T1 (selected subsets)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#redpajama-data-t1-selected-subsets_1" class="md-nav__link">
+    <span class="md-ellipsis">
+      RedPajama-Data T1 (selected subsets)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-english" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [English]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikihow" class="md-nav__link">
+    <span class="md-ellipsis">
+      WikiHow
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-en" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [en]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pes2o" class="md-nav__link">
+    <span class="md-ellipsis">
+      peS2o
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#proof-pile" class="md-nav__link">
+    <span class="md-ellipsis">
+      proof-pile
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="english-datasets">English Datasets</h1>
+<p>In total, there are 50 English-language datasets with 117 B tokens.</p>
+<h2 id="auxiliary-mathematics-problems-and-solutions-amps-dataset">Auxiliary Mathematics Problems and Solutions (AMPS) dataset</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>math_amps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Auxiliary Mathematics Problems and Solutions (AMPS) dataset</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Our pretraining dataset, the Auxiliary Mathematics Problems and Solutions (AMPS) dataset, has problems and step-by-step solutions typeset in LaTeX. AMPS contains over 100,000 problems pulled from Khan Academy and approximately 5 million problems generated from manually designed Mathematica scripts. Problems include various aspects of algebra, calculus, counting and statistics, geometry, linear algebra, and number theory.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://github.com/hendrycks/math]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>repository license is MIT; no specific data license (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2015-14">Colossal OSCAR 1 [en; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2016-40">Colossal OSCAR 1 [en; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2017-43">Colossal OSCAR 1 [en; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2018-47">Colossal OSCAR 1 [en; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2019-22">Colossal OSCAR 1 [en; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2020-24">Colossal OSCAR 1 [en; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2020-45">Colossal OSCAR 1 [en; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2021-49">Colossal OSCAR 1 [en; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2022-27">Colossal OSCAR 1 [en; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2022-49">Colossal OSCAR 1 [en; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2023-14">Colossal OSCAR 1 [en; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-en-2023-23">Colossal OSCAR 1 [en; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [en; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="edgarcorpus">EdgarCorpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>edgarcorpus</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EdgarCorpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained. This dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/eloukas/edgar-corpus]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-en">EurlexResources [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-en">LegalMC4 [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>967 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-selected-subsets">Pile of Law (selected subsets)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law (selected subsets)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CreativeCommons Attribution-NonCommercial-ShareAlike 4.0 International. But individual sources may have other licenses. See paper for details. (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-selected-subsets_1">Pile of Law (selected subsets)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law (selected subsets)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CreativeCommons Attribution-NonCommercial-ShareAlike 4.0 International. But individual sources may have other licenses. See paper for details. (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-acus_reports">Pile of Law [acus_reports]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_acus_reports</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [acus_reports]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-atticus_contracts">Pile of Law [atticus_contracts]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_atticus_contracts</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [atticus_contracts]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC BY 4.0 (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-cc_casebooks">Pile of Law [cc_casebooks]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_cc_casebooks</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [cc_casebooks]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Mixed; Most restrictive: CC BY-NC-SA 4.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-cfpb_creditcard_contracts">Pile of Law [cfpb_creditcard_contracts]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_cfpb_creditcard_contracts</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [cfpb_creditcard_contracts]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Publicly available, unknown license. Assumed to be governed by fair use standards. (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-congressional_hearings">Pile of Law [congressional_hearings]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_congressional_hearings</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [congressional_hearings]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-constitutions">Pile of Law [constitutions]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_constitutions</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [constitutions]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC BY-NC 3.0 (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-courtlistener_docket_entry_documents">Pile of Law [courtlistener_docket_entry_documents]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_courtlistener_docket_entry_documents</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [courtlistener_docket_entry_documents]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Underlying content is Public Domain. (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-courtlistener_opinions">Pile of Law [courtlistener_opinions]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_courtlistener_opinions</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [courtlistener_opinions]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-doj_guidance_documents">Pile of Law [doj_guidance_documents]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_doj_guidance_documents</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [doj_guidance_documents]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-echr">Pile of Law [echr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_echr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [echr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Non-commercial, commercial use requires written permission (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-ed_policy_guidance">Pile of Law [ed_policy_guidance]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_ed_policy_guidance</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [ed_policy_guidance]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-exam_outlines">Pile of Law [exam_outlines]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_exam_outlines</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [exam_outlines]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Publicly available, unknown license. Assumed to be governed by fair use standards. (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-icj-pcij">Pile of Law [icj-pcij]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_icj-pcij</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [icj-pcij]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-medicaid_policy_guidance">Pile of Law [medicaid_policy_guidance]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_medicaid_policy_guidance</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [medicaid_policy_guidance]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-r_legaladvice">Pile of Law [r_legaladvice]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_r_legaladvice</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [r_legaladvice]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-resource_contracts">Pile of Law [resource_contracts]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_resource_contracts</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [resource_contracts]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-scotus_oral_arguments">Pile of Law [scotus_oral_arguments]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_scotus_oral_arguments</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [scotus_oral_arguments]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-tos">Pile of Law [tos]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_tos</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [tos]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Publicly available, unknown license. Assumed to be governed by fair use standards. (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-un_debates">Pile of Law [un_debates]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_un_debates</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [un_debates]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pile-of-law-uspto_office_actions">Pile of Law [uspto_office_actions]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pile_of_law_uspto_office_actions</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Pile of Law [uspto_office_actions]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/pile-of-law/pile-of-law]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>None</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="redpajama-data-t1-selected-subsets">RedPajama-Data T1 (selected subsets)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>redpajama_book</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>RedPajama-Data T1 (selected subsets)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>An Open Source Recipe to Reproduce LLaMA training dataset</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>partially copyrighted/pirated (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>26 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="redpajama-data-t1-selected-subsets_1">RedPajama-Data T1 (selected subsets)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>redpajama_stackexchange</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>RedPajama-Data T1 (selected subsets)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>An Open Source Recipe to Reproduce LLaMA training dataset</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>cc-by-sa 4.0 (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>20 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-english">WURA [English]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [English]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly spoken in Africa.</td>
+</tr>
+
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/castorini/wura]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikihow">WikiHow</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikihow</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WikiHow</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>WikiHow is a new large-scale dataset using the online WikiHow (http://www.wikihow.com/) knowledge base.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://github.com/mahnazkoupaee/WikiHow-Dataset]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC BY-NC-SA 3.0 (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-en">Wikibooks [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>129 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-en">Wikinews [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikinews.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>15 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-en">Wikipedia [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-en">Wikiquote [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>125 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-en">Wikisource [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>731 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-en">Wikivoyage [en]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [en]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikivoyage.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>48 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="pes2o">peS2o</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pes2o</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>peS2o</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The peS2o dataset is a collection of ~40M creative open-access academic papers, cleaned, filtered, and formatted for pre-training of language models. It is derived from the Semantic Scholar Open Research Corpus (Lo et al., 2020), or S2ORC.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/allenai/peS2o]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Open Data Commons Attribution License (ODC-By) v1.0 (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>42 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="proof-pile">proof-pile</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>proof_pile</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>proof-pile</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The proof-pile is a 13GB pre-training dataset of mathematical text that comprises 8.3 billion tokens (using the gpt-neox tokenizer). The dataset is composed of diverse sources of both informal and formal mathematics.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/hoskinson-center/proof-pile]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache 2.0 (probably code license instead of data license) (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 B</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_eo/index.html b/datasets/language_eo/index.html
new file mode 100644
index 0000000..571a094
--- /dev/null
+++ b/datasets/language_eo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_eo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Esperanto Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#esperanto-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Esperanto Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="esperanto-datasets">Esperanto Datasets</h1>
+<p>There are 12 datasets in total for the Esperanto language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-eo-2015-14">Colossal OSCAR 1 [eo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2016-40">Colossal OSCAR 1 [eo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2017-43">Colossal OSCAR 1 [eo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2018-47">Colossal OSCAR 1 [eo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2019-22">Colossal OSCAR 1 [eo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2020-24">Colossal OSCAR 1 [eo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2020-45">Colossal OSCAR 1 [eo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2021-49">Colossal OSCAR 1 [eo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2022-27">Colossal OSCAR 1 [eo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2022-49">Colossal OSCAR 1 [eo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2023-14">Colossal OSCAR 1 [eo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eo-2023-23">Colossal OSCAR 1 [eo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_eo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_es/index.html b/datasets/language_es/index.html
new file mode 100644
index 0000000..62d042f
--- /dev/null
+++ b/datasets/language_es/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_es/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Spanish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#spanish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Spanish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-es-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [es; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#spanish-legal-domain-corpora" class="md-nav__link">
+    <span class="md-ellipsis">
+      Spanish Legal Domain Corpora
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [es]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-es" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [es]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="spanish-datasets">Spanish Datasets</h1>
+<p>There are 21 datasets in total, with 20 B tokens, in the Spanish language.</p>
+<h2 id="colossal-oscar-1-es-2015-14">Colossal OSCAR 1 [es; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2016-40">Colossal OSCAR 1 [es; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2017-43">Colossal OSCAR 1 [es; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2018-47">Colossal OSCAR 1 [es; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2019-22">Colossal OSCAR 1 [es; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2020-24">Colossal OSCAR 1 [es; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2020-45">Colossal OSCAR 1 [es; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2021-49">Colossal OSCAR 1 [es; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2022-27">Colossal OSCAR 1 [es; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2022-49">Colossal OSCAR 1 [es; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2023-14">Colossal OSCAR 1 [es; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-es-2023-23">Colossal OSCAR 1 [es; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [es; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-es">EurlexResources [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-es">LegalMC4 [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>9 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="spanish-legal-domain-corpora">Spanish Legal Domain Corpora</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>spanish_legal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Spanish Legal Domain Corpora</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A collection of corpora from the Spanish legal domain.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://github.com/PlanTL-GOB-ES/lm-legal-es">https://github.com/PlanTL-GOB-ES/lm-legal-es</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-es">Wikibooks [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>24 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-es">Wikinews [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-es">Wikipedia [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-es">Wikiquote [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-es">Wikisource [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>112 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-es">Wikivoyage [es]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_es</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [es]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>14 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_et/index.html b/datasets/language_et/index.html
new file mode 100644
index 0000000..22115a3
--- /dev/null
+++ b/datasets/language_et/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_et/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Estonian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#estonian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Estonian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-et-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [et; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#ekspress-news-article-archive-only-estonian-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      Ekspress news article archive (only Estonian) 1.0
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#estonian-national-corpus-2021" class="md-nav__link">
+    <span class="md-ellipsis">
+      Estonian National Corpus 2021
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#estonian-reference-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      Estonian Reference Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-et" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [et]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-et" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [et]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-et" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [et]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-et" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [et]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-et" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [et]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-et" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [et]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="estonian-datasets">Estonian Datasets</h1>
+<p>In total, there are 21 datasets with 5 B tokens in the Estonian language.</p>
+<h2 id="colossal-oscar-1-et-2015-14">Colossal OSCAR 1 [et; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2016-40">Colossal OSCAR 1 [et; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2017-43">Colossal OSCAR 1 [et; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2018-47">Colossal OSCAR 1 [et; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2019-22">Colossal OSCAR 1 [et; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2020-24">Colossal OSCAR 1 [et; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2020-45">Colossal OSCAR 1 [et; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2021-49">Colossal OSCAR 1 [et; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2022-27">Colossal OSCAR 1 [et; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2022-49">Colossal OSCAR 1 [et; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2023-14">Colossal OSCAR 1 [et; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-et-2023-23">Colossal OSCAR 1 [et; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [et; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="ekspress-news-article-archive-only-estonian-10">Ekspress news article archive (only Estonian) 1.0</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>ekspress</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Ekspress news article archive (only Estonian) 1.0</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The dataset is an archive of articles from the Ekspress Meedia news site from 2009-2019, containing over 1.4M articles, mostly in Estonian (1,115,120 articles), with some in Russian (325,952 articles).</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1408">https://www.clarin.si/repository/xmlui/handle/11356/1408</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="estonian-national-corpus-2021">Estonian National Corpus 2021</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>enc2021</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Estonian National Corpus 2021</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The corpus is based on the Estonian National Corpus 2013, which was renewed by Lexical Computing Ltd. in 2017 and 2019 at the request of the Estonian Language Institute. Subcorpora are: Estonian Reference Corpus 1990-2008, Estonian Web 2013, Estonian Web 2017, Estonian Web 2019, Estonian Wikipedia 2017, Estonian Wikipedia 2019, Estonian Open Access Journals (DOAJ), blogs, discussion, education, fiction, food, health, journals, news, religion, science, sex, society, sports.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
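+<td><a href="https://entu.keeleressursid.ee/shared/9939/EVKultjxSeFA2QhFkbE7fGGDGNT1zmJLOUGFK9hw53tq9Rx2YBTejI1IoKhy65zq">https://entu.keeleressursid.ee/shared/9939/EVKultjxSeFA2QhFkbE7fGGDGNT1zmJLOUGFK9hw53tq9Rx2YBTejI1IoKhy65zq</a></td>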
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-NonCommercial 4.0 International License (commercial use: False, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="estonian-reference-corpus">Estonian Reference Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>estonian_reference_corpus</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Estonian Reference Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>This corpus includes Estonian texts (fiction, PhD theses, newspapers, magazines, parliamentary transcriptions, computer-mediated communication) published between 1990 and 2007. The corpus is encoded in TEI. The corpus is available for online browsing through a dedicated concordancer and is available for download from CELR.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.cl.ut.ee/korpused/segakorpus/">https://www.cl.ut.ee/korpused/segakorpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>free for non-commercial use (commercial use: False, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>175 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-et">EurlexResources [et]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [et]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-et">LegalMC4 [et]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [et]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>110 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-et">Wikibooks [et]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [et]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-et">Wikipedia [et]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [et]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>61 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-et">Wikiquote [et]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [et]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>13 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-et">Wikisource [et]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_et</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [et]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_eu/index.html b/datasets/language_eu/index.html
new file mode 100644
index 0000000..0aed0dc
--- /dev/null
+++ b/datasets/language_eu/index.html
@@ -0,0 +1,1457 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_eu/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Basque Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#basque-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Basque Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-eu-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [eu; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#euscrawl" class="md-nav__link">
+    <span class="md-ellipsis">
+      EusCrawl
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#euscrawl-filtered-no-wikipedia-no-nc-licenses" class="md-nav__link">
+    <span class="md-ellipsis">
+      EusCrawl (filtered: no Wikipedia, no NC-licenses)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-eu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [eu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-eu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [eu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-eu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [eu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-eu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [eu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-eu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [eu]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="basque-datasets">Basque Datasets</h1>
+<p>In total, there are 19 Basque-language datasets with 982 M tokens.</p>
+<h2 id="colossal-oscar-1-eu-2015-14">Colossal OSCAR 1 [eu; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2016-40">Colossal OSCAR 1 [eu; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2017-43">Colossal OSCAR 1 [eu; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2018-47">Colossal OSCAR 1 [eu; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2019-22">Colossal OSCAR 1 [eu; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2020-24">Colossal OSCAR 1 [eu; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2020-45">Colossal OSCAR 1 [eu; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2021-49">Colossal OSCAR 1 [eu; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2022-27">Colossal OSCAR 1 [eu; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2022-49">Colossal OSCAR 1 [eu; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2023-14">Colossal OSCAR 1 [eu; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-eu-2023-23">Colossal OSCAR 1 [eu; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [eu; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>136 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="euscrawl">EusCrawl</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>euscrawl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EusCrawl</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>EusCrawl (http://www.ixa.eus/euscrawl/) is a high-quality corpus for Basque comprising 12.5 million documents and 423 million tokens, totalling 2.1 GiB of uncompressed text. EusCrawl was built using ad-hoc scrapers to extract text from 33 Basque websites with high-quality content, resulting in cleaner text compared to general purpose approaches.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[None]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed (see Tab. 2 in paper, e.g., CC-BY-NC-ND, CC-BY-NC-SA) (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>423 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="euscrawl-filtered-no-wikipedia-no-nc-licenses">EusCrawl (filtered: no Wikipedia, no NC-licenses)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>euscrawl_filtered</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EusCrawl (filtered: no Wikipedia, no NC-licenses)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>EusCrawl (http://www.ixa.eus/euscrawl/) is a high-quality corpus for Basque comprising 12.5 million documents and 423 million tokens, totalling 2.1 GiB of uncompressed text. EusCrawl was built using ad-hoc scrapers to extract text from 33 Basque websites with high-quality content, resulting in cleaner text compared to general purpose approaches.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[None]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>423 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-eu">Wikibooks [eu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [eu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-eu">Wikinews [eu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [eu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikinews.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-eu">Wikipedia [eu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [eu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-eu">Wikiquote [eu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [eu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-eu">Wikisource [eu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_eu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [eu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_fa/index.html b/datasets/language_fa/index.html
new file mode 100644
index 0000000..746057b
--- /dev/null
+++ b/datasets/language_fa/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_fa/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Persian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#persian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Persian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fa-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fa; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="persian-datasets">Persian Datasets</h1>
+<p>There are 12 datasets in total for the Persian language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-fa-2015-14">Colossal OSCAR 1 [fa; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2016-40">Colossal OSCAR 1 [fa; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2017-43">Colossal OSCAR 1 [fa; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2018-47">Colossal OSCAR 1 [fa; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2019-22">Colossal OSCAR 1 [fa; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2020-24">Colossal OSCAR 1 [fa; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2020-45">Colossal OSCAR 1 [fa; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2021-49">Colossal OSCAR 1 [fa; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2022-27">Colossal OSCAR 1 [fa; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2022-49">Colossal OSCAR 1 [fa; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2023-14">Colossal OSCAR 1 [fa; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fa-2023-23">Colossal OSCAR 1 [fa; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_fa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fa; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_fi/index.html b/datasets/language_fi/index.html
new file mode 100644
index 0000000..f6c7f83
--- /dev/null
+++ b/datasets/language_fi/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_fi/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Finnish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#finnish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Finnish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fi-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fi; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-fi" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [fi]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#yle-finnish-news-archive" class="md-nav__link">
+    <span class="md-ellipsis">
+      Yle Finnish News Archive
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="finnish-datasets">Finnish Datasets</h1>
+<p>In total, there are 21 datasets with 9 B tokens in the Finnish language.</p>
+<h2 id="colossal-oscar-1-fi-2015-14">Colossal OSCAR 1 [fi; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2016-40">Colossal OSCAR 1 [fi; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2017-43">Colossal OSCAR 1 [fi; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2018-47">Colossal OSCAR 1 [fi; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2019-22">Colossal OSCAR 1 [fi; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2020-24">Colossal OSCAR 1 [fi; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2020-45">Colossal OSCAR 1 [fi; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2021-49">Colossal OSCAR 1 [fi; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2022-27">Colossal OSCAR 1 [fi; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2022-49">Colossal OSCAR 1 [fi; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2023-14">Colossal OSCAR 1 [fi; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fi-2023-23">Colossal OSCAR 1 [fi; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fi; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-fi">EurlexResources [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-fi">LegalMC4 [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>63 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-fi">Wikibooks [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-fi">Wikinews [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>748 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-fi">Wikipedia [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>137 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-fi">Wikiquote [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-fi">Wikisource [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>18 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-fi">Wikivoyage [fi]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_fi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [fi]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="yle-finnish-news-archive">Yle Finnish News Archive</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>ylenews</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Yle Finnish News Archive</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The corpus, containing the articles from YLE https://yle.fi from 2019 and 2020, is available at www.kielipankki.fi/download</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="http://urn.fi/urn:nbn:fi:lb-2021050401">http://urn.fi/urn:nbn:fi:lb-2021050401</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CLARIN ACA - NC (Academic - Non Commercial Use, Attribution, No Redistribution, Other) (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_fr/index.html b/datasets/language_fr/index.html
new file mode 100644
index 0000000..90ab80e
--- /dev/null
+++ b/datasets/language_fr/index.html
@@ -0,0 +1,1593 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_fr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>French Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#french-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              French Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#cabernet-a-new-french-balanced-reference-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      CaBeRnet: a New French Balanced Reference Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-french" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [French]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [fr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-fr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [fr]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="french-datasets">French Datasets</h1>
+<p>In total, there are 22 datasets with 60 B tokens in the French language.</p>
+<h2 id="cabernet-a-new-french-balanced-reference-corpus">CaBeRnet: a New French Balanced Reference Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>cabernet</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CaBeRnet: a New French Balanced Reference Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A new balanced French corpus, CaBeRnet, that features a representative range of language usage, including a balanced variety of genres (oral transcriptions, newspapers, popular magazines, technical reports, fiction, academic texts), in oral and written styles.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://aclanthology.org/2020.cmlc-1.3/">https://aclanthology.org/2020.cmlc-1.3/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons License (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>712 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2015-14">Colossal OSCAR 1 [fr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2016-40">Colossal OSCAR 1 [fr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2017-43">Colossal OSCAR 1 [fr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2018-47">Colossal OSCAR 1 [fr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2019-22">Colossal OSCAR 1 [fr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2020-24">Colossal OSCAR 1 [fr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2020-45">Colossal OSCAR 1 [fr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2021-49">Colossal OSCAR 1 [fr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2022-27">Colossal OSCAR 1 [fr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2022-49">Colossal OSCAR 1 [fr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2023-14">Colossal OSCAR 1 [fr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fr-2023-23">Colossal OSCAR 1 [fr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>48 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-fr">EurlexResources [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-fr">LegalMC4 [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-french">WURA [French]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [French]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly spoken in Africa.</td>
+</tr>
+
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/castorini/wura]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-fr">Wikibooks [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>24 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-fr">Wikinews [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikinews.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-fr">Wikipedia [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-fr">Wikiquote [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>473 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-fr">Wikisource [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>38 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-fr">Wikivoyage [fr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_fr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [fr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikivoyage.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_fy/index.html b/datasets/language_fy/index.html
new file mode 100644
index 0000000..653fe78
--- /dev/null
+++ b/datasets/language_fy/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_fy/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Western Frisian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#western-frisian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Western Frisian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-fy-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [fy; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="western-frisian-datasets">Western Frisian Datasets</h1>
+<p>There are 12 datasets in total for the Western Frisian language (total tokens: N/A).</p>
+<h2 id="colossal-oscar-1-fy-2015-14">Colossal OSCAR 1 [fy; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2016-40">Colossal OSCAR 1 [fy; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2017-43">Colossal OSCAR 1 [fy; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2018-47">Colossal OSCAR 1 [fy; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2019-22">Colossal OSCAR 1 [fy; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2020-24">Colossal OSCAR 1 [fy; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2020-45">Colossal OSCAR 1 [fy; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2021-49">Colossal OSCAR 1 [fy; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2022-27">Colossal OSCAR 1 [fy; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2022-49">Colossal OSCAR 1 [fy; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2023-14">Colossal OSCAR 1 [fy; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-fy-2023-23">Colossal OSCAR 1 [fy; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_fy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [fy; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ga/index.html b/datasets/language_ga/index.html
new file mode 100644
index 0000000..c35a2d0
--- /dev/null
+++ b/datasets/language_ga/index.html
@@ -0,0 +1,1457 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ga/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Irish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#irish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Irish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ga-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ga; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-ga" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [ga]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#irish-universal-dependencies" class="md-nav__link">
+    <span class="md-ellipsis">
+      Irish Universal Dependencies
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-ga" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [ga]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#the-gaois-bilingual-corpus-of-english-irish-legislation-irish-legislation" class="md-nav__link">
+    <span class="md-ellipsis">
+      The Gaois bilingual corpus of English-Irish legislation (Irish legislation)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-ga" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [ga]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-ga" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [ga]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-ga" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [ga]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="irish-datasets">Irish Datasets</h1>
+<p>In total, there are 19 Irish-language datasets with 669 M tokens.</p>
+<h2 id="colossal-oscar-1-ga-2015-14">Colossal OSCAR 1 [ga; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2016-40">Colossal OSCAR 1 [ga; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2017-43">Colossal OSCAR 1 [ga; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2018-47">Colossal OSCAR 1 [ga; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2019-22">Colossal OSCAR 1 [ga; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2020-24">Colossal OSCAR 1 [ga; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2020-45">Colossal OSCAR 1 [ga; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2021-49">Colossal OSCAR 1 [ga; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2022-27">Colossal OSCAR 1 [ga; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2022-49">Colossal OSCAR 1 [ga; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2023-14">Colossal OSCAR 1 [ga; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ga-2023-23">Colossal OSCAR 1 [ga; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ga; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-ga">EurlexResources [ga]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [ga]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>650 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="irish-universal-dependencies">Irish Universal Dependencies</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>ga_universal_dependencies</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Irish Universal Dependencies</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Universal Dependencies (UD) is a framework for consistent annotation of grammar (parts of speech, morphological features, and syntactic dependencies) across different human languages.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://universaldependencies.org/">https://universaldependencies.org/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed (CC BY-SA 3.0 or CC BY-SA 4.0) (commercial use: True, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-ga">LegalMC4 [ga]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [ga]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>33 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="the-gaois-bilingual-corpus-of-english-irish-legislation-irish-legislation">The Gaois bilingual corpus of English-Irish legislation (Irish legislation)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>ga_bilingual_legistation</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>The Gaois bilingual corpus of English-Irish legislation (Irish legislation)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Bilingual corpus of English-Irish legislation provided by the Department of Justice.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
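+<td><a href="https://portulanclarin.net/repository/browse/the-gaois-bilingual-corpus-of-english-irish-legislation-processed/daeac17c9e3511ea9b7f02420a000407b83de243dc0b469aab41084386c5b80f/">https://portulanclarin.net/repository/browse/the-gaois-bilingual-corpus-of-english-irish-legislation-processed/daeac17c9e3511ea9b7f02420a000407b83de243dc0b469aab41084386c5b80f/</a></td>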
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Open Under - PSI (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-ga">Wikibooks [ga]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [ga]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-ga">Wikipedia [ga]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [ga]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-ga">Wikiquote [ga]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_ga</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [ga]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>233</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_gd/index.html b/datasets/language_gd/index.html
new file mode 100644
index 0000000..c8c43b3
--- /dev/null
+++ b/datasets/language_gd/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_gd/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Gaelic Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#gaelic-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Gaelic Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gd-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gd; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="gaelic-datasets">Gaelic Datasets</h1>
+<p>There are 12 datasets in total for the Gaelic language; their combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-gd-2015-14">Colossal OSCAR 1 [gd; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2016-40">Colossal OSCAR 1 [gd; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2017-43">Colossal OSCAR 1 [gd; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2018-47">Colossal OSCAR 1 [gd; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2019-22">Colossal OSCAR 1 [gd; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2020-24">Colossal OSCAR 1 [gd; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2020-45">Colossal OSCAR 1 [gd; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2021-49">Colossal OSCAR 1 [gd; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2022-27">Colossal OSCAR 1 [gd; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2022-49">Colossal OSCAR 1 [gd; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2023-14">Colossal OSCAR 1 [gd; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gd-2023-23">Colossal OSCAR 1 [gd; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_gd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gd; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_gl/index.html b/datasets/language_gl/index.html
new file mode 100644
index 0000000..a260701
--- /dev/null
+++ b/datasets/language_gl/index.html
@@ -0,0 +1,1369 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_gl/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Galician Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#galician-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Galician Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gl-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gl; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-gl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [gl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-gl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [gl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-gl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [gl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-gl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [gl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-gl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [gl]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="galician-datasets">Galician Datasets</h1>
+<p>In total, there are 17 datasets with 36 M tokens in the Galician language.</p>
+<h2 id="colossal-oscar-1-gl-2015-14">Colossal OSCAR 1 [gl; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2016-40">Colossal OSCAR 1 [gl; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2017-43">Colossal OSCAR 1 [gl; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2018-47">Colossal OSCAR 1 [gl; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2019-22">Colossal OSCAR 1 [gl; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2020-24">Colossal OSCAR 1 [gl; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2020-45">Colossal OSCAR 1 [gl; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2021-49">Colossal OSCAR 1 [gl; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2022-27">Colossal OSCAR 1 [gl; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2022-49">Colossal OSCAR 1 [gl; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2023-14">Colossal OSCAR 1 [gl; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gl-2023-23">Colossal OSCAR 1 [gl; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gl; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>36 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-gl">Wikibooks [gl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [gl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-gl">Wikinews [gl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [gl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-gl">Wikipedia [gl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [gl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-gl">Wikiquote [gl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [gl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-gl">Wikisource [gl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_gl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [gl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_gn/index.html b/datasets/language_gn/index.html
new file mode 100644
index 0000000..973e3e5
--- /dev/null
+++ b/datasets/language_gn/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_gn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Guaraní Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#guarani-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Guaraní Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gn-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gn; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="guarani-datasets">Guaraní Datasets</h1>
+<p>There are 12 datasets in total in the Guaraní language; their combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-gn-2015-14">Colossal OSCAR 1 [gn; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2016-40">Colossal OSCAR 1 [gn; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2017-43">Colossal OSCAR 1 [gn; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2018-47">Colossal OSCAR 1 [gn; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2019-22">Colossal OSCAR 1 [gn; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2020-24">Colossal OSCAR 1 [gn; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2020-45">Colossal OSCAR 1 [gn; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2021-49">Colossal OSCAR 1 [gn; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2022-27">Colossal OSCAR 1 [gn; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2022-49">Colossal OSCAR 1 [gn; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2023-14">Colossal OSCAR 1 [gn; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gn-2023-23">Colossal OSCAR 1 [gn; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_gn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gn; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_gom/index.html b/datasets/language_gom/index.html
new file mode 100644
index 0000000..8985b85
--- /dev/null
+++ b/datasets/language_gom/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_gom/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Gom Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#gom-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Gom Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gom-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gom; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="gom-datasets">Gom Datasets</h1>
+<p>There are 12 datasets in total for the Gom language; their token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-gom-2015-14">Colossal OSCAR 1 [gom; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2016-40">Colossal OSCAR 1 [gom; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2017-43">Colossal OSCAR 1 [gom; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2018-47">Colossal OSCAR 1 [gom; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2019-22">Colossal OSCAR 1 [gom; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2020-24">Colossal OSCAR 1 [gom; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2020-45">Colossal OSCAR 1 [gom; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2021-49">Colossal OSCAR 1 [gom; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2022-27">Colossal OSCAR 1 [gom; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2022-49">Colossal OSCAR 1 [gom; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2023-14">Colossal OSCAR 1 [gom; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gom-2023-23">Colossal OSCAR 1 [gom; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_gom</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gom; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_gsw/index.html b/datasets/language_gsw/index.html
new file mode 100644
index 0000000..9032d7b
--- /dev/null
+++ b/datasets/language_gsw/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_gsw/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Gsw Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#gsw-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Gsw Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gsw-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gsw; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="gsw-datasets">Gsw Datasets</h1>
+<p>There are 12 datasets in total in the Gsw language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-gsw-2015-14">Colossal OSCAR 1 [gsw; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2016-40">Colossal OSCAR 1 [gsw; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2017-43">Colossal OSCAR 1 [gsw; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2018-47">Colossal OSCAR 1 [gsw; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2019-22">Colossal OSCAR 1 [gsw; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2020-24">Colossal OSCAR 1 [gsw; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2020-45">Colossal OSCAR 1 [gsw; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2021-49">Colossal OSCAR 1 [gsw; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2022-27">Colossal OSCAR 1 [gsw; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2022-49">Colossal OSCAR 1 [gsw; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2023-14">Colossal OSCAR 1 [gsw; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gsw-2023-23">Colossal OSCAR 1 [gsw; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_gsw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gsw; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_gu/index.html b/datasets/language_gu/index.html
new file mode 100644
index 0000000..087904a
--- /dev/null
+++ b/datasets/language_gu/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_gu/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Gujarati Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#gujarati-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Gujarati Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-gu-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [gu; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="gujarati-datasets">Gujarati Datasets</h1>
+<p>There are 12 datasets in total for the Gujarati language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-gu-2015-14">Colossal OSCAR 1 [gu; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2016-40">Colossal OSCAR 1 [gu; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2017-43">Colossal OSCAR 1 [gu; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2018-47">Colossal OSCAR 1 [gu; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2019-22">Colossal OSCAR 1 [gu; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2020-24">Colossal OSCAR 1 [gu; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2020-45">Colossal OSCAR 1 [gu; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2021-49">Colossal OSCAR 1 [gu; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2022-27">Colossal OSCAR 1 [gu; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2022-49">Colossal OSCAR 1 [gu; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2023-14">Colossal OSCAR 1 [gu; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-gu-2023-23">Colossal OSCAR 1 [gu; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_gu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [gu; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ha/index.html b/datasets/language_ha/index.html
new file mode 100644
index 0000000..db7136e
--- /dev/null
+++ b/datasets/language_ha/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ha/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Hausa Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#hausa-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Hausa Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-hausa" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Hausa]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="hausa-datasets">Hausa Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Hausa language.</p>
+<h2 id="wura-hausa">WURA [Hausa]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_ha</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Hausa]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly</td>
+</tr>
+<tr>
+<td>spoken in Africa.</td>
+<td></td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_he/index.html b/datasets/language_he/index.html
new file mode 100644
index 0000000..2595b79
--- /dev/null
+++ b/datasets/language_he/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_he/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Hebrew Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#hebrew-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Hebrew Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-he-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [he; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="hebrew-datasets">Hebrew Datasets</h1>
+<p>There are 12 Hebrew-language datasets in total. Token counts are not available (N/A) for these datasets.</p>
+<h2 id="colossal-oscar-1-he-2015-14">Colossal OSCAR 1 [he; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2016-40">Colossal OSCAR 1 [he; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2017-43">Colossal OSCAR 1 [he; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2018-47">Colossal OSCAR 1 [he; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2019-22">Colossal OSCAR 1 [he; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2020-24">Colossal OSCAR 1 [he; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2020-45">Colossal OSCAR 1 [he; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2021-49">Colossal OSCAR 1 [he; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2022-27">Colossal OSCAR 1 [he; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2022-49">Colossal OSCAR 1 [he; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2023-14">Colossal OSCAR 1 [he; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-he-2023-23">Colossal OSCAR 1 [he; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_he</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [he; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_hi/index.html b/datasets/language_hi/index.html
new file mode 100644
index 0000000..929bdfd
--- /dev/null
+++ b/datasets/language_hi/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_hi/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Hindi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#hindi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Hindi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hi-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hi; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="hindi-datasets">Hindi Datasets</h1>
+<p>There are 12 datasets in total for the Hindi language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-hi-2015-14">Colossal OSCAR 1 [hi; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2016-40">Colossal OSCAR 1 [hi; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2017-43">Colossal OSCAR 1 [hi; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2018-47">Colossal OSCAR 1 [hi; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2019-22">Colossal OSCAR 1 [hi; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2020-24">Colossal OSCAR 1 [hi; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2020-45">Colossal OSCAR 1 [hi; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2021-49">Colossal OSCAR 1 [hi; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2022-27">Colossal OSCAR 1 [hi; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2022-49">Colossal OSCAR 1 [hi; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2023-14">Colossal OSCAR 1 [hi; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hi-2023-23">Colossal OSCAR 1 [hi; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_hi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hi; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_hr/index.html b/datasets/language_hr/index.html
new file mode 100644
index 0000000..cf4900a
--- /dev/null
+++ b/datasets/language_hr/index.html
@@ -0,0 +1,1589 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_hr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Croatian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#croatian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Croatian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#24sata-news-article-archive-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      24sata news article archive 1.0
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-croatian" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Croatian]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#corpus-of-croatian-news-portals-engri-2014-2018" class="md-nav__link">
+    <span class="md-ellipsis">
+      Corpus of Croatian news portals ENGRI (2014-2018)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#croatian-web-corpus-hrwac-21" class="md-nav__link">
+    <span class="md-ellipsis">
+      Croatian web corpus hrWaC 2.1
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-hr" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [hr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-croatian" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Croatian]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-hr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [hr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-hr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [hr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-hr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [hr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-hr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [hr]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="croatian-datasets">Croatian Datasets</h1>
+<p>There are 22 datasets in total, with 8 B tokens, in the Croatian language.</p>
+<h2 id="24sata-news-article-archive-10">24sata news article archive 1.0</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>styria_news</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>24sata news article archive 1.0</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The 24sata news portal consists of a portal with daily news and several smaller portals covering news from specific topics, such as automotive news, health, culinary content, and lifestyle advice. The dataset contains over  650,000 articles in Croatian from 2007 to 2019, as well as assigned tags.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1410">https://www.clarin.si/repository/xmlui/handle/11356/1410</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) (commercial use: False, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>409 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="curlicat-corpus-croatian">CURLICAT Corpus [Croatian]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Croatian]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
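+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://elrc-share.eu/repository/browse/curlicat-croatian-corpus/00815518592811ed9c1a00155d026706bc4c59740fce4f7986213e7eef133023/">https://elrc-share.eu/repository/browse/curlicat-croatian-corpus/00815518592811ed9c1a00155d026706bc4c59740fce4f7986213e7eef133023/</a></td>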
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>49 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2015-14">Colossal OSCAR 1 [hr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2016-40">Colossal OSCAR 1 [hr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2017-43">Colossal OSCAR 1 [hr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2018-47">Colossal OSCAR 1 [hr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2019-22">Colossal OSCAR 1 [hr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2020-24">Colossal OSCAR 1 [hr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2020-45">Colossal OSCAR 1 [hr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2021-49">Colossal OSCAR 1 [hr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2022-27">Colossal OSCAR 1 [hr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2022-49">Colossal OSCAR 1 [hr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2023-14">Colossal OSCAR 1 [hr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hr-2023-23">Colossal OSCAR 1 [hr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="corpus-of-croatian-news-portals-engri-2014-2018">Corpus of Croatian news portals ENGRI (2014-2018)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>croatian_news_engri</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Corpus of Croatian news portals ENGRI (2014-2018)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The corpus consists of texts collected from the most popular (based on the Reuters Institute Digital News Report for 2018, retrieved from http://www.digitalnewsreport.org in April, 2019) news portals in Croatia in the period from 2014 to 2018: Direktno, Dnevno, Net Hr, Hrt, Index_Hr, Jutarnji, Novilist, Rtl, SlobodnaDalmacija, Večernji, Tportal, Dnevnik.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://repository.pfri.uniri.hr/islandora/object/pfri%3A2156">https://repository.pfri.uniri.hr/islandora/object/pfri%3A2156</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>695 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="croatian-web-corpus-hrwac-21">Croatian web corpus hrWaC 2.1</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>hrwac</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Croatian web corpus hrWaC 2.1</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>hrWaC is a web corpus collected from the .hr top-level domain. The current version of the corpus (v2.0) contains 1.9 billion tokens and is annotated with the lemma, morphosyntax and dependency syntax layers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="http://nlp.ffzg.hr/resources/corpora/hrwac/">http://nlp.ffzg.hr/resources/corpora/hrwac/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA license (commercial use: True, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-hr">EurlexResources [hr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [hr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-croatian">MaCoCu web corpus [Croatian]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Croatian]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1806">https://www.clarin.si/repository/xmlui/handle/11356/1806</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-hr">Wikibooks [hr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [hr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>538 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-hr">Wikipedia [hr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [hr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>65 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-hr">Wikiquote [hr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [hr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>995 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-hr">Wikisource [hr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_hr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [hr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>20 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_hsb/index.html b/datasets/language_hsb/index.html
new file mode 100644
index 0000000..d21cae4
--- /dev/null
+++ b/datasets/language_hsb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_hsb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Hsb Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#hsb-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Hsb Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hsb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hsb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="hsb-datasets">Hsb Datasets</h1>
+<p>There are 12 datasets in total for the Hsb language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-hsb-2015-14">Colossal OSCAR 1 [hsb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2016-40">Colossal OSCAR 1 [hsb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2017-43">Colossal OSCAR 1 [hsb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2018-47">Colossal OSCAR 1 [hsb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2019-22">Colossal OSCAR 1 [hsb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2020-24">Colossal OSCAR 1 [hsb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2020-45">Colossal OSCAR 1 [hsb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2021-49">Colossal OSCAR 1 [hsb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2022-27">Colossal OSCAR 1 [hsb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2022-49">Colossal OSCAR 1 [hsb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2023-14">Colossal OSCAR 1 [hsb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hsb-2023-23">Colossal OSCAR 1 [hsb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_hsb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hsb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ht/index.html b/datasets/language_ht/index.html
new file mode 100644
index 0000000..aa22113
--- /dev/null
+++ b/datasets/language_ht/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ht/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Haitian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#haitian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Haitian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ht-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ht; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="haitian-datasets">Haitian Datasets</h1>
+<p>There are 12 datasets in total for the Haitian language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ht-2015-14">Colossal OSCAR 1 [ht; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2016-40">Colossal OSCAR 1 [ht; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2017-43">Colossal OSCAR 1 [ht; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2018-47">Colossal OSCAR 1 [ht; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2019-22">Colossal OSCAR 1 [ht; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2020-24">Colossal OSCAR 1 [ht; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2020-45">Colossal OSCAR 1 [ht; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2021-49">Colossal OSCAR 1 [ht; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2022-27">Colossal OSCAR 1 [ht; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2022-49">Colossal OSCAR 1 [ht; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2023-14">Colossal OSCAR 1 [ht; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ht-2023-23">Colossal OSCAR 1 [ht; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ht</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ht; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_hu/index.html b/datasets/language_hu/index.html
new file mode 100644
index 0000000..2bdbe0e
--- /dev/null
+++ b/datasets/language_hu/index.html
@@ -0,0 +1,1501 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_hu/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Hungarian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#hungarian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Hungarian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-hungarian" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Hungarian]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hu-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hu; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [hu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [hu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [hu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [hu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [hu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [hu]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-hu" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [hu]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="hungarian-datasets">Hungarian Datasets</h1>
+<p>There are 20 datasets in total, with 12 B tokens in the Hungarian language.</p>
+<h2 id="curlicat-corpus-hungarian">CURLICAT Corpus [Hungarian]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Hungarian]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
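+<td><a href="https://elrc-share.eu/repository/browse/curlicat-hungarian-corpus/8b6c8dcb58ea11ed9c1a00155d02670679a453431c8147079e5a7d9b879a9729/">https://elrc-share.eu/repository/browse/curlicat-hungarian-corpus/8b6c8dcb58ea11ed9c1a00155d02670679a453431c8147079e5a7d9b879a9729/</a></td>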
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA-4.0 (commercial use: None, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>61 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2015-14">Colossal OSCAR 1 [hu; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2016-40">Colossal OSCAR 1 [hu; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2017-43">Colossal OSCAR 1 [hu; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2018-47">Colossal OSCAR 1 [hu; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2019-22">Colossal OSCAR 1 [hu; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2020-24">Colossal OSCAR 1 [hu; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2020-45">Colossal OSCAR 1 [hu; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2021-49">Colossal OSCAR 1 [hu; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2022-27">Colossal OSCAR 1 [hu; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2022-49">Colossal OSCAR 1 [hu; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2023-14">Colossal OSCAR 1 [hu; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hu-2023-23">Colossal OSCAR 1 [hu; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hu; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-hu">EurlexResources [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-hu">LegalMC4 [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>245 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-hu">Wikibooks [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>19 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-hu">Wikinews [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>427 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-hu">Wikipedia [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>308 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-hu">Wikiquote [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-hu">Wikisource [hu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_hu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [hu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>36 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_hy/index.html b/datasets/language_hy/index.html
new file mode 100644
index 0000000..6638581
--- /dev/null
+++ b/datasets/language_hy/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_hy/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Armenian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#armenian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Armenian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-hy-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [hy; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="armenian-datasets">Armenian Datasets</h1>
+<p>There are 12 datasets in total for the Armenian language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-hy-2015-14">Colossal OSCAR 1 [hy; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2016-40">Colossal OSCAR 1 [hy; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2017-43">Colossal OSCAR 1 [hy; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2018-47">Colossal OSCAR 1 [hy; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2019-22">Colossal OSCAR 1 [hy; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2020-24">Colossal OSCAR 1 [hy; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2020-45">Colossal OSCAR 1 [hy; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2021-49">Colossal OSCAR 1 [hy; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2022-27">Colossal OSCAR 1 [hy; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2022-49">Colossal OSCAR 1 [hy; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2023-14">Colossal OSCAR 1 [hy; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-hy-2023-23">Colossal OSCAR 1 [hy; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_hy</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [hy; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ia/index.html b/datasets/language_ia/index.html
new file mode 100644
index 0000000..99041ff
--- /dev/null
+++ b/datasets/language_ia/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ia/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Interlingua Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#interlingua-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Interlingua Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ia-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ia; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="interlingua-datasets">Interlingua Datasets</h1>
+<p>There are 12 datasets in total for the Interlingua language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-ia-2015-14">Colossal OSCAR 1 [ia; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2016-40">Colossal OSCAR 1 [ia; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2017-43">Colossal OSCAR 1 [ia; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2018-47">Colossal OSCAR 1 [ia; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2019-22">Colossal OSCAR 1 [ia; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2020-24">Colossal OSCAR 1 [ia; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2020-45">Colossal OSCAR 1 [ia; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2021-49">Colossal OSCAR 1 [ia; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2022-27">Colossal OSCAR 1 [ia; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2022-49">Colossal OSCAR 1 [ia; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2023-14">Colossal OSCAR 1 [ia; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ia-2023-23">Colossal OSCAR 1 [ia; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ia</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ia; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_id/index.html b/datasets/language_id/index.html
new file mode 100644
index 0000000..2fb75ec
--- /dev/null
+++ b/datasets/language_id/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_id/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Indonesian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#indonesian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Indonesian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-id-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [id; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="indonesian-datasets">Indonesian Datasets</h1>
+<p>There are 12 datasets in total for the Indonesian language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-id-2015-14">Colossal OSCAR 1 [id; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2016-40">Colossal OSCAR 1 [id; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2017-43">Colossal OSCAR 1 [id; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2018-47">Colossal OSCAR 1 [id; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2019-22">Colossal OSCAR 1 [id; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2020-24">Colossal OSCAR 1 [id; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2020-45">Colossal OSCAR 1 [id; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2021-49">Colossal OSCAR 1 [id; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2022-27">Colossal OSCAR 1 [id; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2022-49">Colossal OSCAR 1 [id; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2023-14">Colossal OSCAR 1 [id; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-id-2023-23">Colossal OSCAR 1 [id; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_id</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [id; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ie/index.html b/datasets/language_ie/index.html
new file mode 100644
index 0000000..88f9911
--- /dev/null
+++ b/datasets/language_ie/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ie/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Interlingue Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#interlingue-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Interlingue Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ie-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ie; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="interlingue-datasets">Interlingue Datasets</h1>
+<p>There are 12 datasets in total for the Interlingue language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-ie-2015-14">Colossal OSCAR 1 [ie; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2016-40">Colossal OSCAR 1 [ie; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2017-43">Colossal OSCAR 1 [ie; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2018-47">Colossal OSCAR 1 [ie; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2019-22">Colossal OSCAR 1 [ie; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2020-24">Colossal OSCAR 1 [ie; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2020-45">Colossal OSCAR 1 [ie; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2021-49">Colossal OSCAR 1 [ie; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2022-27">Colossal OSCAR 1 [ie; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2022-49">Colossal OSCAR 1 [ie; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2023-14">Colossal OSCAR 1 [ie; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ie-2023-23">Colossal OSCAR 1 [ie; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ie</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ie; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ig/index.html b/datasets/language_ig/index.html
new file mode 100644
index 0000000..76802f0
--- /dev/null
+++ b/datasets/language_ig/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ig/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Igbo Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#igbo-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Igbo Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-igbo" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Igbo]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="igbo-datasets">Igbo Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Igbo language.</p>
+<h2 id="wura-igbo">WURA [Igbo]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_ig</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Igbo]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+</tr>
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ilo/index.html b/datasets/language_ilo/index.html
new file mode 100644
index 0000000..88be788
--- /dev/null
+++ b/datasets/language_ilo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ilo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ilo Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ilo-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ilo Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ilo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ilo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ilo-datasets">Ilo Datasets</h1>
+<p>There are 12 datasets in total for the Ilo language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ilo-2015-14">Colossal OSCAR 1 [ilo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2016-40">Colossal OSCAR 1 [ilo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2017-43">Colossal OSCAR 1 [ilo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2018-47">Colossal OSCAR 1 [ilo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2019-22">Colossal OSCAR 1 [ilo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2020-24">Colossal OSCAR 1 [ilo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2020-45">Colossal OSCAR 1 [ilo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2021-49">Colossal OSCAR 1 [ilo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2022-27">Colossal OSCAR 1 [ilo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2022-49">Colossal OSCAR 1 [ilo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2023-14">Colossal OSCAR 1 [ilo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ilo-2023-23">Colossal OSCAR 1 [ilo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ilo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ilo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_io/index.html b/datasets/language_io/index.html
new file mode 100644
index 0000000..37cdd79
--- /dev/null
+++ b/datasets/language_io/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_io/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ido Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ido-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ido Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-io-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [io; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ido-datasets">Ido Datasets</h1>
+<p>There are 12 datasets in total for the Ido language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-io-2015-14">Colossal OSCAR 1 [io; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2016-40">Colossal OSCAR 1 [io; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2017-43">Colossal OSCAR 1 [io; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2018-47">Colossal OSCAR 1 [io; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2019-22">Colossal OSCAR 1 [io; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2020-24">Colossal OSCAR 1 [io; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2020-45">Colossal OSCAR 1 [io; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2021-49">Colossal OSCAR 1 [io; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2022-27">Colossal OSCAR 1 [io; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2022-49">Colossal OSCAR 1 [io; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2023-14">Colossal OSCAR 1 [io; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-io-2023-23">Colossal OSCAR 1 [io; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_io</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [io; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_is/index.html b/datasets/language_is/index.html
new file mode 100644
index 0000000..ca0b809
--- /dev/null
+++ b/datasets/language_is/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_is/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Icelandic Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#icelandic-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Icelandic Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-is-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [is; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="icelandic-datasets">Icelandic Datasets</h1>
+<p>There are 12 datasets in total for the Icelandic language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-is-2015-14">Colossal OSCAR 1 [is; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2016-40">Colossal OSCAR 1 [is; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2017-43">Colossal OSCAR 1 [is; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2018-47">Colossal OSCAR 1 [is; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2019-22">Colossal OSCAR 1 [is; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2020-24">Colossal OSCAR 1 [is; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2020-45">Colossal OSCAR 1 [is; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2021-49">Colossal OSCAR 1 [is; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2022-27">Colossal OSCAR 1 [is; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2022-49">Colossal OSCAR 1 [is; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2023-14">Colossal OSCAR 1 [is; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-is-2023-23">Colossal OSCAR 1 [is; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_is</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [is; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_it/index.html b/datasets/language_it/index.html
new file mode 100644
index 0000000..c701f9d
--- /dev/null
+++ b/datasets/language_it/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_it/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Italian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#italian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Italian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-it-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [it; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#itwac" class="md-nav__link">
+    <span class="md-ellipsis">
+      ITWaC
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [it]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-it" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [it]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="italian-datasets">Italian Datasets</h1>
+<p>There are 21 datasets in total, with 14 B tokens, in the Italian language.</p>
+<h2 id="colossal-oscar-1-it-2015-14">Colossal OSCAR 1 [it; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2016-40">Colossal OSCAR 1 [it; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2017-43">Colossal OSCAR 1 [it; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2018-47">Colossal OSCAR 1 [it; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2019-22">Colossal OSCAR 1 [it; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2020-24">Colossal OSCAR 1 [it; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2020-45">Colossal OSCAR 1 [it; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2021-49">Colossal OSCAR 1 [it; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2022-27">Colossal OSCAR 1 [it; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2022-49">Colossal OSCAR 1 [it; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2023-14">Colossal OSCAR 1 [it; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-it-2023-23">Colossal OSCAR 1 [it; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [it; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-it">EurlexResources [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="itwac">ITWaC</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>itwac</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>ITWaC</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>itWaC: a 2 billion word corpus constructed from the Web limiting the crawl to the .it domain and using medium-frequency words from the Repubblica corpus and basic Italian vocabulary lists as seeds.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://docs.sslmit.unibo.it/doku.php?id=corpora:itwac">https://docs.sslmit.unibo.it/doku.php?id=corpora:itwac</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research-only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-it">LegalMC4 [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-it">Wikibooks [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>29 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-it">Wikinews [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-it">Wikipedia [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>821 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-it">Wikiquote [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>55 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-it">Wikisource [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>66 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-it">Wikivoyage [it]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_it</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [it]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>13 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ja/index.html b/datasets/language_ja/index.html
new file mode 100644
index 0000000..84fda9e
--- /dev/null
+++ b/datasets/language_ja/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ja/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Japanese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#japanese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Japanese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ja-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ja; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="japanese-datasets">Japanese Datasets</h1>
+<p>There are 12 datasets in total for the Japanese language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ja-2015-14">Colossal OSCAR 1 [ja; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2016-40">Colossal OSCAR 1 [ja; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2017-43">Colossal OSCAR 1 [ja; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2018-47">Colossal OSCAR 1 [ja; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2019-22">Colossal OSCAR 1 [ja; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2020-24">Colossal OSCAR 1 [ja; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2020-45">Colossal OSCAR 1 [ja; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2021-49">Colossal OSCAR 1 [ja; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2022-27">Colossal OSCAR 1 [ja; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2022-49">Colossal OSCAR 1 [ja; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2023-14">Colossal OSCAR 1 [ja; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ja-2023-23">Colossal OSCAR 1 [ja; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ja</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ja; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_jbo/index.html b/datasets/language_jbo/index.html
new file mode 100644
index 0000000..ee58146
--- /dev/null
+++ b/datasets/language_jbo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_jbo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Jbo Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#jbo-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Jbo Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jbo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jbo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="jbo-datasets">Jbo Datasets</h1>
+<p>In total, there are 12 datasets with N/A tokens in the Jbo language.</p>
+<h2 id="colossal-oscar-1-jbo-2015-14">Colossal OSCAR 1 [jbo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2016-40">Colossal OSCAR 1 [jbo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2017-43">Colossal OSCAR 1 [jbo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2018-47">Colossal OSCAR 1 [jbo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2019-22">Colossal OSCAR 1 [jbo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2020-24">Colossal OSCAR 1 [jbo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2020-45">Colossal OSCAR 1 [jbo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2021-49">Colossal OSCAR 1 [jbo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2022-27">Colossal OSCAR 1 [jbo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2022-49">Colossal OSCAR 1 [jbo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2023-14">Colossal OSCAR 1 [jbo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jbo-2023-23">Colossal OSCAR 1 [jbo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_jbo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jbo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_jv/index.html b/datasets/language_jv/index.html
new file mode 100644
index 0000000..2b7a8c6
--- /dev/null
+++ b/datasets/language_jv/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_jv/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Javanese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#javanese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Javanese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-jv-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [jv; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="javanese-datasets">Javanese Datasets</h1>
+<p>There are 12 datasets in total for the Javanese language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-jv-2015-14">Colossal OSCAR 1 [jv; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2016-40">Colossal OSCAR 1 [jv; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2017-43">Colossal OSCAR 1 [jv; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2018-47">Colossal OSCAR 1 [jv; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2019-22">Colossal OSCAR 1 [jv; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availibility:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2020-24">Colossal OSCAR 1 [jv; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2020-45">Colossal OSCAR 1 [jv; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2021-49">Colossal OSCAR 1 [jv; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2022-27">Colossal OSCAR 1 [jv; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2022-49">Colossal OSCAR 1 [jv; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2023-14">Colossal OSCAR 1 [jv; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-jv-2023-23">Colossal OSCAR 1 [jv; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_jv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [jv; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ka/index.html b/datasets/language_ka/index.html
new file mode 100644
index 0000000..fd406db
--- /dev/null
+++ b/datasets/language_ka/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ka/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Georgian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#georgian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Georgian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ka-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ka; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="georgian-datasets">Georgian Datasets</h1>
+<p>In total, there are 12 datasets in the Georgian language (total tokens: N/A).</p>
+<h2 id="colossal-oscar-1-ka-2015-14">Colossal OSCAR 1 [ka; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2016-40">Colossal OSCAR 1 [ka; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2017-43">Colossal OSCAR 1 [ka; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2018-47">Colossal OSCAR 1 [ka; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2019-22">Colossal OSCAR 1 [ka; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2020-24">Colossal OSCAR 1 [ka; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2020-45">Colossal OSCAR 1 [ka; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2021-49">Colossal OSCAR 1 [ka; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2022-27">Colossal OSCAR 1 [ka; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2022-49">Colossal OSCAR 1 [ka; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2023-14">Colossal OSCAR 1 [ka; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ka-2023-23">Colossal OSCAR 1 [ka; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ka</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ka; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_kk/index.html b/datasets/language_kk/index.html
new file mode 100644
index 0000000..ebe063f
--- /dev/null
+++ b/datasets/language_kk/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_kk/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Kazakh Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#kazakh-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Kazakh Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kk-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kk; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="kazakh-datasets">Kazakh Datasets</h1>
+<p>There are 12 datasets in total for the Kazakh language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-kk-2015-14">Colossal OSCAR 1 [kk; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2016-40">Colossal OSCAR 1 [kk; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2017-43">Colossal OSCAR 1 [kk; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2018-47">Colossal OSCAR 1 [kk; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2019-22">Colossal OSCAR 1 [kk; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2020-24">Colossal OSCAR 1 [kk; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2020-45">Colossal OSCAR 1 [kk; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2021-49">Colossal OSCAR 1 [kk; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2022-27">Colossal OSCAR 1 [kk; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2022-49">Colossal OSCAR 1 [kk; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2023-14">Colossal OSCAR 1 [kk; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kk-2023-23">Colossal OSCAR 1 [kk; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_kk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kk; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_km/index.html b/datasets/language_km/index.html
new file mode 100644
index 0000000..c3d466d
--- /dev/null
+++ b/datasets/language_km/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_km/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Khmer Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#khmer-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Khmer Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-km-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [km; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="khmer-datasets">Khmer Datasets</h1>
+<p>There are 12 datasets in total for the Khmer language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-km-2015-14">Colossal OSCAR 1 [km; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2016-40">Colossal OSCAR 1 [km; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2017-43">Colossal OSCAR 1 [km; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2018-47">Colossal OSCAR 1 [km; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2019-22">Colossal OSCAR 1 [km; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2020-24">Colossal OSCAR 1 [km; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2020-45">Colossal OSCAR 1 [km; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2021-49">Colossal OSCAR 1 [km; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2022-27">Colossal OSCAR 1 [km; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2022-49">Colossal OSCAR 1 [km; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2023-14">Colossal OSCAR 1 [km; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-km-2023-23">Colossal OSCAR 1 [km; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_km</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [km; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_kn/index.html b/datasets/language_kn/index.html
new file mode 100644
index 0000000..0ab3152
--- /dev/null
+++ b/datasets/language_kn/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_kn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Kannada Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#kannada-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Kannada Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kn-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kn; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="kannada-datasets">Kannada Datasets</h1>
+<p>There are 12 datasets in total for the Kannada language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-kn-2015-14">Colossal OSCAR 1 [kn; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2016-40">Colossal OSCAR 1 [kn; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2017-43">Colossal OSCAR 1 [kn; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2018-47">Colossal OSCAR 1 [kn; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2019-22">Colossal OSCAR 1 [kn; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2020-24">Colossal OSCAR 1 [kn; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2020-45">Colossal OSCAR 1 [kn; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2021-49">Colossal OSCAR 1 [kn; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2022-27">Colossal OSCAR 1 [kn; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2022-49">Colossal OSCAR 1 [kn; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2023-14">Colossal OSCAR 1 [kn; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kn-2023-23">Colossal OSCAR 1 [kn; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_kn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kn; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ko/index.html b/datasets/language_ko/index.html
new file mode 100644
index 0000000..a178ae2
--- /dev/null
+++ b/datasets/language_ko/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ko/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Korean Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#korean-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Korean Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ko-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ko; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="korean-datasets">Korean Datasets</h1>
+<p>There are 12 Korean-language datasets in total; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ko-2015-14">Colossal OSCAR 1 [ko; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2016-40">Colossal OSCAR 1 [ko; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2017-43">Colossal OSCAR 1 [ko; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2018-47">Colossal OSCAR 1 [ko; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2019-22">Colossal OSCAR 1 [ko; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2020-24">Colossal OSCAR 1 [ko; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2020-45">Colossal OSCAR 1 [ko; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2021-49">Colossal OSCAR 1 [ko; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2022-27">Colossal OSCAR 1 [ko; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2022-49">Colossal OSCAR 1 [ko; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2023-14">Colossal OSCAR 1 [ko; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ko-2023-23">Colossal OSCAR 1 [ko; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ko</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ko; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_krc/index.html b/datasets/language_krc/index.html
new file mode 100644
index 0000000..462ac8b
--- /dev/null
+++ b/datasets/language_krc/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_krc/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Krc Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#krc-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Krc Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-krc-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [krc; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="krc-datasets">Krc Datasets</h1>
+<p>There are 12 datasets in total for the Krc language; the total token count is N/A.</p>
+<h2 id="colossal-oscar-1-krc-2015-14">Colossal OSCAR 1 [krc; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2016-40">Colossal OSCAR 1 [krc; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2017-43">Colossal OSCAR 1 [krc; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2018-47">Colossal OSCAR 1 [krc; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2019-22">Colossal OSCAR 1 [krc; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2020-24">Colossal OSCAR 1 [krc; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2020-45">Colossal OSCAR 1 [krc; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2021-49">Colossal OSCAR 1 [krc; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2022-27">Colossal OSCAR 1 [krc; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2022-49">Colossal OSCAR 1 [krc; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2023-14">Colossal OSCAR 1 [krc; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-krc-2023-23">Colossal OSCAR 1 [krc; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_krc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [krc; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ku/index.html b/datasets/language_ku/index.html
new file mode 100644
index 0000000..6c6b1bf
--- /dev/null
+++ b/datasets/language_ku/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ku/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Kurdish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#kurdish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Kurdish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ku-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ku; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="kurdish-datasets">Kurdish Datasets</h1>
+<p>There are 12 datasets in total for the Kurdish language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ku-2015-14">Colossal OSCAR 1 [ku; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2016-40">Colossal OSCAR 1 [ku; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2017-43">Colossal OSCAR 1 [ku; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2018-47">Colossal OSCAR 1 [ku; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2019-22">Colossal OSCAR 1 [ku; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2020-24">Colossal OSCAR 1 [ku; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2020-45">Colossal OSCAR 1 [ku; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2021-49">Colossal OSCAR 1 [ku; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2022-27">Colossal OSCAR 1 [ku; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2022-49">Colossal OSCAR 1 [ku; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2023-14">Colossal OSCAR 1 [ku; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ku-2023-23">Colossal OSCAR 1 [ku; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ku</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ku; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_kv/index.html b/datasets/language_kv/index.html
new file mode 100644
index 0000000..0fb22e1
--- /dev/null
+++ b/datasets/language_kv/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_kv/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Komi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#komi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Komi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kv-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kv; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="komi-datasets">Komi Datasets</h1>
+<p>There are 12 datasets in total for the Komi language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-kv-2015-14">Colossal OSCAR 1 [kv; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2016-40">Colossal OSCAR 1 [kv; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2017-43">Colossal OSCAR 1 [kv; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2018-47">Colossal OSCAR 1 [kv; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2019-22">Colossal OSCAR 1 [kv; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2020-24">Colossal OSCAR 1 [kv; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2020-45">Colossal OSCAR 1 [kv; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2021-49">Colossal OSCAR 1 [kv; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2022-27">Colossal OSCAR 1 [kv; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2022-49">Colossal OSCAR 1 [kv; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2023-14">Colossal OSCAR 1 [kv; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kv-2023-23">Colossal OSCAR 1 [kv; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_kv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kv; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_kw/index.html b/datasets/language_kw/index.html
new file mode 100644
index 0000000..551df86
--- /dev/null
+++ b/datasets/language_kw/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_kw/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Cornish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#cornish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Cornish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-kw-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [kw; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="cornish-datasets">Cornish Datasets</h1>
+<p>There are 12 datasets in total for the Cornish language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-kw-2015-14">Colossal OSCAR 1 [kw; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2016-40">Colossal OSCAR 1 [kw; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2017-43">Colossal OSCAR 1 [kw; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2018-47">Colossal OSCAR 1 [kw; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2019-22">Colossal OSCAR 1 [kw; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2020-24">Colossal OSCAR 1 [kw; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2020-45">Colossal OSCAR 1 [kw; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2021-49">Colossal OSCAR 1 [kw; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2022-27">Colossal OSCAR 1 [kw; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2022-49">Colossal OSCAR 1 [kw; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2023-14">Colossal OSCAR 1 [kw; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-kw-2023-23">Colossal OSCAR 1 [kw; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_kw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [kw; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ky/index.html b/datasets/language_ky/index.html
new file mode 100644
index 0000000..8015a4a
--- /dev/null
+++ b/datasets/language_ky/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ky/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Kirghiz Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#kirghiz-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Kirghiz Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ky-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ky; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-kirghiz" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Kirghiz]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="kirghiz-datasets">Kirghiz Datasets</h1>
+<p>There are 13 datasets in total for the Kirghiz language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-ky-2015-14">Colossal OSCAR 1 [ky; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2016-40">Colossal OSCAR 1 [ky; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2017-43">Colossal OSCAR 1 [ky; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2018-47">Colossal OSCAR 1 [ky; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2019-22">Colossal OSCAR 1 [ky; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2020-24">Colossal OSCAR 1 [ky; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2020-45">Colossal OSCAR 1 [ky; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2021-49">Colossal OSCAR 1 [ky; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2022-27">Colossal OSCAR 1 [ky; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2022-49">Colossal OSCAR 1 [ky; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2023-14">Colossal OSCAR 1 [ky; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ky-2023-23">Colossal OSCAR 1 [ky; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ky</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ky; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-kirghiz">WURA [Kirghiz]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Kirghiz]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>
+Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.
+</td>
+</tr>
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_la/index.html b/datasets/language_la/index.html
new file mode 100644
index 0000000..bca9532
--- /dev/null
+++ b/datasets/language_la/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_la/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Latin Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#latin-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Latin Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-la-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [la; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="latin-datasets">Latin Datasets</h1>
+<p>There are 12 datasets in total for the Latin language; the combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-la-2015-14">Colossal OSCAR 1 [la; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2016-40">Colossal OSCAR 1 [la; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2017-43">Colossal OSCAR 1 [la; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2018-47">Colossal OSCAR 1 [la; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2019-22">Colossal OSCAR 1 [la; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2020-24">Colossal OSCAR 1 [la; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2020-45">Colossal OSCAR 1 [la; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2021-49">Colossal OSCAR 1 [la; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2022-27">Colossal OSCAR 1 [la; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2022-49">Colossal OSCAR 1 [la; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2023-14">Colossal OSCAR 1 [la; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-la-2023-23">Colossal OSCAR 1 [la; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_la</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [la; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_lb/index.html b/datasets/language_lb/index.html
new file mode 100644
index 0000000..5f5209f
--- /dev/null
+++ b/datasets/language_lb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_lb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Luxembourgish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#luxembourgish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Luxembourgish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="luxembourgish-datasets">Luxembourgish Datasets</h1>
+<p>There are 12 datasets in total for the Luxembourgish language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-lb-2015-14">Colossal OSCAR 1 [lb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2016-40">Colossal OSCAR 1 [lb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2017-43">Colossal OSCAR 1 [lb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2018-47">Colossal OSCAR 1 [lb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2019-22">Colossal OSCAR 1 [lb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2020-24">Colossal OSCAR 1 [lb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2020-45">Colossal OSCAR 1 [lb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2021-49">Colossal OSCAR 1 [lb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2022-27">Colossal OSCAR 1 [lb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2022-49">Colossal OSCAR 1 [lb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2023-14">Colossal OSCAR 1 [lb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lb-2023-23">Colossal OSCAR 1 [lb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_lb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_lez/index.html b/datasets/language_lez/index.html
new file mode 100644
index 0000000..a871570
--- /dev/null
+++ b/datasets/language_lez/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_lez/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Lez Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#lez-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Lez Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lez-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lez; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="lez-datasets">Lez Datasets</h1>
+<p>In total, there are 12 datasets in the Lez language; the total token count is N/A.</p>
+<h2 id="colossal-oscar-1-lez-2015-14">Colossal OSCAR 1 [lez; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2016-40">Colossal OSCAR 1 [lez; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2017-43">Colossal OSCAR 1 [lez; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2018-47">Colossal OSCAR 1 [lez; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2019-22">Colossal OSCAR 1 [lez; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2020-24">Colossal OSCAR 1 [lez; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2020-45">Colossal OSCAR 1 [lez; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2021-49">Colossal OSCAR 1 [lez; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2022-27">Colossal OSCAR 1 [lez; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2022-49">Colossal OSCAR 1 [lez; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2023-14">Colossal OSCAR 1 [lez; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lez-2023-23">Colossal OSCAR 1 [lez; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_lez</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lez; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_li/index.html b/datasets/language_li/index.html
new file mode 100644
index 0000000..9d901f5
--- /dev/null
+++ b/datasets/language_li/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_li/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Limburgish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#limburgish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Limburgish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-li-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [li; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="limburgish-datasets">Limburgish Datasets</h1>
+<p>There are 12 datasets in total with N/A tokens in the Limburgish language.</p>
+<h2 id="colossal-oscar-1-li-2015-14">Colossal OSCAR 1 [li; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2016-40">Colossal OSCAR 1 [li; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2017-43">Colossal OSCAR 1 [li; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2018-47">Colossal OSCAR 1 [li; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2019-22">Colossal OSCAR 1 [li; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2020-24">Colossal OSCAR 1 [li; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2020-45">Colossal OSCAR 1 [li; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2021-49">Colossal OSCAR 1 [li; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2022-27">Colossal OSCAR 1 [li; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2022-49">Colossal OSCAR 1 [li; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2023-14">Colossal OSCAR 1 [li; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-li-2023-23">Colossal OSCAR 1 [li; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_li</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [li; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_lmo/index.html b/datasets/language_lmo/index.html
new file mode 100644
index 0000000..f734df4
--- /dev/null
+++ b/datasets/language_lmo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_lmo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Lmo Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#lmo-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Lmo Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lmo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lmo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="lmo-datasets">Lmo Datasets</h1>
+<p>There are 12 datasets in total for the Lmo language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-lmo-2015-14">Colossal OSCAR 1 [lmo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2016-40">Colossal OSCAR 1 [lmo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2017-43">Colossal OSCAR 1 [lmo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2018-47">Colossal OSCAR 1 [lmo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2019-22">Colossal OSCAR 1 [lmo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2020-24">Colossal OSCAR 1 [lmo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2020-45">Colossal OSCAR 1 [lmo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2021-49">Colossal OSCAR 1 [lmo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2022-27">Colossal OSCAR 1 [lmo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2022-49">Colossal OSCAR 1 [lmo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2023-14">Colossal OSCAR 1 [lmo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lmo-2023-23">Colossal OSCAR 1 [lmo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_lmo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lmo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_lo/index.html b/datasets/language_lo/index.html
new file mode 100644
index 0000000..2754e61
--- /dev/null
+++ b/datasets/language_lo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_lo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Lao Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#lao-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Lao Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="lao-datasets">Lao Datasets</h1>
+<p>There are 12 datasets in total for the Lao language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-lo-2015-14">Colossal OSCAR 1 [lo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2016-40">Colossal OSCAR 1 [lo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2017-43">Colossal OSCAR 1 [lo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2018-47">Colossal OSCAR 1 [lo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2019-22">Colossal OSCAR 1 [lo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2020-24">Colossal OSCAR 1 [lo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2020-45">Colossal OSCAR 1 [lo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2021-49">Colossal OSCAR 1 [lo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2022-27">Colossal OSCAR 1 [lo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2022-49">Colossal OSCAR 1 [lo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2023-14">Colossal OSCAR 1 [lo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lo-2023-23">Colossal OSCAR 1 [lo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_lo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_lt/index.html b/datasets/language_lt/index.html
new file mode 100644
index 0000000..aad7182
--- /dev/null
+++ b/datasets/language_lt/index.html
@@ -0,0 +1,1457 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_lt/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Lithuanian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#lithuanian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Lithuanian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#bilingual-english-lithuanian-parallel-corpus-from-seimas-of-the-republic-of-lithuania-website" class="md-nav__link">
+    <span class="md-ellipsis">
+      Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lt-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lt; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-lt" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [lt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-lt" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [lt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-lt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [lt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-lt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [lt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-lt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [lt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-lt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [lt]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="lithuanian-datasets">Lithuanian Datasets</h1>
+<p>In total, there are 19 datasets with 5 B tokens in the Lithuanian language.</p>
+<h2 id="bilingual-english-lithuanian-parallel-corpus-from-seimas-of-the-republic-of-lithuania-website">Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>seimas_lt_en</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Contents of http://www.lrs.lt were crawled, aligned on document and sentence level and converted into a parallel corpus.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://live.european-language-grid.eu/catalogue/corpus/3009/download/">https://live.european-language-grid.eu/catalogue/corpus/3009/download/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Open under PSI (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>48 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2015-14">Colossal OSCAR 1 [lt; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2016-40">Colossal OSCAR 1 [lt; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2017-43">Colossal OSCAR 1 [lt; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2018-47">Colossal OSCAR 1 [lt; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2019-22">Colossal OSCAR 1 [lt; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2020-24">Colossal OSCAR 1 [lt; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2020-45">Colossal OSCAR 1 [lt; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2021-49">Colossal OSCAR 1 [lt; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2022-27">Colossal OSCAR 1 [lt; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2022-49">Colossal OSCAR 1 [lt; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2023-14">Colossal OSCAR 1 [lt; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lt-2023-23">Colossal OSCAR 1 [lt; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lt; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-lt">EurlexResources [lt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [lt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-lt">LegalMC4 [lt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [lt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>9 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-lt">Wikibooks [lt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [lt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>594 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-lt">Wikipedia [lt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [lt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>33 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-lt">Wikiquote [lt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [lt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-lt">Wikisource [lt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_lt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [lt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_lv/index.html b/datasets/language_lv/index.html
new file mode 100644
index 0000000..bacc823
--- /dev/null
+++ b/datasets/language_lv/index.html
@@ -0,0 +1,1369 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_lv/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Latvian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#latvian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Latvian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-lv-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [lv; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#corpus-of-state-related-content-from-the-latvian-web-processed" class="md-nav__link">
+    <span class="md-ellipsis">
+      Corpus of State-related content from the Latvian Web (Processed)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-lv" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [lv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-lv" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [lv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-lv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [lv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-lv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [lv]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="latvian-datasets">Latvian Datasets</h1>
+<p>There are 17 datasets in total, with 4 B tokens, in the Latvian language.</p>
+<h2 id="colossal-oscar-1-lv-2015-14">Colossal OSCAR 1 [lv; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2016-40">Colossal OSCAR 1 [lv; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2017-43">Colossal OSCAR 1 [lv; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2018-47">Colossal OSCAR 1 [lv; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2019-22">Colossal OSCAR 1 [lv; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2020-24">Colossal OSCAR 1 [lv; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2020-45">Colossal OSCAR 1 [lv; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2021-49">Colossal OSCAR 1 [lv; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2022-27">Colossal OSCAR 1 [lv; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2022-49">Colossal OSCAR 1 [lv; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2023-14">Colossal OSCAR 1 [lv; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-lv-2023-23">Colossal OSCAR 1 [lv; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [lv; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>792 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="corpus-of-state-related-content-from-the-latvian-web-processed">Corpus of State-related content from the Latvian Web (Processed)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>state_related_latvian_web</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Corpus of State-related content from the Latvian Web (Processed)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Latvian Web, home pages of ministries and state public services, army, etc. were crawled, and parallel Latvian-English content was collected.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="http://catalog.elra.info/en-us/repository/browse/ELRA-W0169/">http://catalog.elra.info/en-us/repository/browse/ELRA-W0169/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA-4.0 (commercial use: True, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-lv">EurlexResources [lv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [lv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-lv">LegalMC4 [lv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [lv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>59 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-lv">Wikibooks [lv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [lv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>33 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-lv">Wikipedia [lv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_lv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [lv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>29 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mai/index.html b/datasets/language_mai/index.html
new file mode 100644
index 0000000..d74b96f
--- /dev/null
+++ b/datasets/language_mai/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mai/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Mai Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#mai-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Mai Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mai-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mai; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="mai-datasets">Mai Datasets</h1>
+<p>There are 12 datasets in total for the Mai language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-mai-2015-14">Colossal OSCAR 1 [mai; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2016-40">Colossal OSCAR 1 [mai; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2017-43">Colossal OSCAR 1 [mai; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2018-47">Colossal OSCAR 1 [mai; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2019-22">Colossal OSCAR 1 [mai; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2020-24">Colossal OSCAR 1 [mai; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2020-45">Colossal OSCAR 1 [mai; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2021-49">Colossal OSCAR 1 [mai; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2022-27">Colossal OSCAR 1 [mai; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2022-49">Colossal OSCAR 1 [mai; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2023-14">Colossal OSCAR 1 [mai; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mai-2023-23">Colossal OSCAR 1 [mai; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mai</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mai; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mg/index.html b/datasets/language_mg/index.html
new file mode 100644
index 0000000..9788719
--- /dev/null
+++ b/datasets/language_mg/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mg/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Malagasy Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#malagasy-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Malagasy Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mg-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mg; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="malagasy-datasets">Malagasy Datasets</h1>
+<p>There are 12 datasets in total for the Malagasy language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-mg-2015-14">Colossal OSCAR 1 [mg; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2016-40">Colossal OSCAR 1 [mg; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2017-43">Colossal OSCAR 1 [mg; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2018-47">Colossal OSCAR 1 [mg; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2019-22">Colossal OSCAR 1 [mg; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2020-24">Colossal OSCAR 1 [mg; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2020-45">Colossal OSCAR 1 [mg; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2021-49">Colossal OSCAR 1 [mg; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2022-27">Colossal OSCAR 1 [mg; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2022-49">Colossal OSCAR 1 [mg; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2023-14">Colossal OSCAR 1 [mg; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mg-2023-23">Colossal OSCAR 1 [mg; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mg; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mhr/index.html b/datasets/language_mhr/index.html
new file mode 100644
index 0000000..a2802fe
--- /dev/null
+++ b/datasets/language_mhr/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mhr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Mhr Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#mhr-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Mhr Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mhr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mhr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="mhr-datasets">Mhr Datasets</h1>
+<p>There are 12 datasets in total for the Mhr language (token counts: N/A).</p>
+<h2 id="colossal-oscar-1-mhr-2015-14">Colossal OSCAR 1 [mhr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2016-40">Colossal OSCAR 1 [mhr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2017-43">Colossal OSCAR 1 [mhr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2018-47">Colossal OSCAR 1 [mhr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2019-22">Colossal OSCAR 1 [mhr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2020-24">Colossal OSCAR 1 [mhr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2020-45">Colossal OSCAR 1 [mhr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2021-49">Colossal OSCAR 1 [mhr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2022-27">Colossal OSCAR 1 [mhr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2022-49">Colossal OSCAR 1 [mhr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2023-14">Colossal OSCAR 1 [mhr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mhr-2023-23">Colossal OSCAR 1 [mhr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mhr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mhr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_min/index.html b/datasets/language_min/index.html
new file mode 100644
index 0000000..d30375a
--- /dev/null
+++ b/datasets/language_min/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_min/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Min Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#min-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Min Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-min-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [min; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="min-datasets">Min Datasets</h1>
+<p>There are 12 datasets in total in the Min language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-min-2015-14">Colossal OSCAR 1 [min; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2016-40">Colossal OSCAR 1 [min; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2017-43">Colossal OSCAR 1 [min; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2018-47">Colossal OSCAR 1 [min; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2019-22">Colossal OSCAR 1 [min; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2020-24">Colossal OSCAR 1 [min; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2020-45">Colossal OSCAR 1 [min; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2021-49">Colossal OSCAR 1 [min; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2022-27">Colossal OSCAR 1 [min; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2022-49">Colossal OSCAR 1 [min; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2023-14">Colossal OSCAR 1 [min; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-min-2023-23">Colossal OSCAR 1 [min; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_min</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [min; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mk/index.html b/datasets/language_mk/index.html
new file mode 100644
index 0000000..753dbb8
--- /dev/null
+++ b/datasets/language_mk/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mk/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Macedonian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#macedonian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Macedonian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mk-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mk; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="macedonian-datasets">Macedonian Datasets</h1>
+<p>There are 12 datasets in total for the Macedonian language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-mk-2015-14">Colossal OSCAR 1 [mk; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2016-40">Colossal OSCAR 1 [mk; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2017-43">Colossal OSCAR 1 [mk; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2018-47">Colossal OSCAR 1 [mk; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2019-22">Colossal OSCAR 1 [mk; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2020-24">Colossal OSCAR 1 [mk; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2020-45">Colossal OSCAR 1 [mk; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2021-49">Colossal OSCAR 1 [mk; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2022-27">Colossal OSCAR 1 [mk; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2022-49">Colossal OSCAR 1 [mk; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2023-14">Colossal OSCAR 1 [mk; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mk-2023-23">Colossal OSCAR 1 [mk; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mk; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ml/index.html b/datasets/language_ml/index.html
new file mode 100644
index 0000000..94abb54
--- /dev/null
+++ b/datasets/language_ml/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ml/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Malayalam Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#malayalam-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Malayalam Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ml-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ml; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="malayalam-datasets">Malayalam Datasets</h1>
+<p>There are 12 datasets in total for the Malayalam language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-ml-2015-14">Colossal OSCAR 1 [ml; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2016-40">Colossal OSCAR 1 [ml; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2017-43">Colossal OSCAR 1 [ml; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2018-47">Colossal OSCAR 1 [ml; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2019-22">Colossal OSCAR 1 [ml; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2020-24">Colossal OSCAR 1 [ml; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2020-45">Colossal OSCAR 1 [ml; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2021-49">Colossal OSCAR 1 [ml; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2022-27">Colossal OSCAR 1 [ml; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2022-49">Colossal OSCAR 1 [ml; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2023-14">Colossal OSCAR 1 [ml; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ml-2023-23">Colossal OSCAR 1 [ml; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ml; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mn/index.html b/datasets/language_mn/index.html
new file mode 100644
index 0000000..33aacde
--- /dev/null
+++ b/datasets/language_mn/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Mongolian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#mongolian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Mongolian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mn-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mn; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="mongolian-datasets">Mongolian Datasets</h1>
+<p>There are 12 datasets in total for the Mongolian language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-mn-2015-14">Colossal OSCAR 1 [mn; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2016-40">Colossal OSCAR 1 [mn; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2017-43">Colossal OSCAR 1 [mn; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2018-47">Colossal OSCAR 1 [mn; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2019-22">Colossal OSCAR 1 [mn; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2020-24">Colossal OSCAR 1 [mn; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2020-45">Colossal OSCAR 1 [mn; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2021-49">Colossal OSCAR 1 [mn; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2022-27">Colossal OSCAR 1 [mn; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2022-49">Colossal OSCAR 1 [mn; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2023-14">Colossal OSCAR 1 [mn; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mn-2023-23">Colossal OSCAR 1 [mn; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mn; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mr/index.html b/datasets/language_mr/index.html
new file mode 100644
index 0000000..dbcc2d7
--- /dev/null
+++ b/datasets/language_mr/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Marathi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#marathi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Marathi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="marathi-datasets">Marathi Datasets</h1>
+<p>There are 12 datasets in total for the Marathi language (total tokens: N/A).</p>
+<h2 id="colossal-oscar-1-mr-2015-14">Colossal OSCAR 1 [mr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2016-40">Colossal OSCAR 1 [mr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2017-43">Colossal OSCAR 1 [mr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2018-47">Colossal OSCAR 1 [mr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2019-22">Colossal OSCAR 1 [mr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2020-24">Colossal OSCAR 1 [mr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2020-45">Colossal OSCAR 1 [mr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2021-49">Colossal OSCAR 1 [mr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2022-27">Colossal OSCAR 1 [mr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2022-49">Colossal OSCAR 1 [mr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2023-14">Colossal OSCAR 1 [mr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mr-2023-23">Colossal OSCAR 1 [mr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mrj/index.html b/datasets/language_mrj/index.html
new file mode 100644
index 0000000..23b193f
--- /dev/null
+++ b/datasets/language_mrj/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mrj/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Mrj Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#mrj-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Mrj Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mrj-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mrj; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="mrj-datasets">Mrj Datasets</h1>
+<p>There are 12 datasets in total for the Mrj language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-mrj-2015-14">Colossal OSCAR 1 [mrj; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2016-40">Colossal OSCAR 1 [mrj; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2017-43">Colossal OSCAR 1 [mrj; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2018-47">Colossal OSCAR 1 [mrj; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2019-22">Colossal OSCAR 1 [mrj; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2020-24">Colossal OSCAR 1 [mrj; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2020-45">Colossal OSCAR 1 [mrj; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2021-49">Colossal OSCAR 1 [mrj; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2022-27">Colossal OSCAR 1 [mrj; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2022-49">Colossal OSCAR 1 [mrj; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2023-14">Colossal OSCAR 1 [mrj; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mrj-2023-23">Colossal OSCAR 1 [mrj; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mrj</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mrj; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ms/index.html b/datasets/language_ms/index.html
new file mode 100644
index 0000000..1bda774
--- /dev/null
+++ b/datasets/language_ms/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ms/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Malay Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#malay-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Malay Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ms-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ms; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="malay-datasets">Malay Datasets</h1>
+<p>There are 12 datasets in total for the Malay language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-ms-2015-14">Colossal OSCAR 1 [ms; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2016-40">Colossal OSCAR 1 [ms; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2017-43">Colossal OSCAR 1 [ms; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2018-47">Colossal OSCAR 1 [ms; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2019-22">Colossal OSCAR 1 [ms; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2020-24">Colossal OSCAR 1 [ms; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2020-45">Colossal OSCAR 1 [ms; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2021-49">Colossal OSCAR 1 [ms; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2022-27">Colossal OSCAR 1 [ms; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2022-49">Colossal OSCAR 1 [ms; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2023-14">Colossal OSCAR 1 [ms; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ms-2023-23">Colossal OSCAR 1 [ms; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ms; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mt/index.html b/datasets/language_mt/index.html
new file mode 100644
index 0000000..73ef2e6
--- /dev/null
+++ b/datasets/language_mt/index.html
@@ -0,0 +1,1369 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mt/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Maltese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#maltese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Maltese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mt-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mt; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-mt" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [mt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#korpus-malti" class="md-nav__link">
+    <span class="md-ellipsis">
+      Korpus Malti
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-mt" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [mt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-maltese-20" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Maltese 2.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-mt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [mt]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="maltese-datasets">Maltese Datasets</h1>
+<p>In total, there are 17 datasets with 4 B tokens in the Maltese language.</p>
+<h2 id="colossal-oscar-1-mt-2015-14">Colossal OSCAR 1 [mt; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2016-40">Colossal OSCAR 1 [mt; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2017-43">Colossal OSCAR 1 [mt; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2018-47">Colossal OSCAR 1 [mt; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2019-22">Colossal OSCAR 1 [mt; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2020-24">Colossal OSCAR 1 [mt; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2020-45">Colossal OSCAR 1 [mt; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2021-49">Colossal OSCAR 1 [mt; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2022-27">Colossal OSCAR 1 [mt; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2022-49">Colossal OSCAR 1 [mt; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2023-14">Colossal OSCAR 1 [mt; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mt-2023-23">Colossal OSCAR 1 [mt; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mt; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>193 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-mt">EurlexResources [mt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [mt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="korpus-malti">Korpus Malti</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>korpus_malti</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Korpus Malti</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>General Corpora for the Maltese Language. This dataset is composed of texts from various genres/domains written in Maltese.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/MLRS/korpus_malti]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (DFKI has a permission for LLM training with commercial license) (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>366 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-mt">LegalMC4 [mt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [mt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-maltese-20">MaCoCu web corpus [Maltese 2.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Maltese 2.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://www.clarin.si/repository/xmlui/handle/11356/1803]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>348 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-mt">Wikipedia [mt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_mt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [mt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_multi/index.html b/datasets/language_multi/index.html
new file mode 100644
index 0000000..15b60e1
--- /dev/null
+++ b/datasets/language_multi/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_multi/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Multi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#multi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Multi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-multi-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [multi; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="multi-datasets">Multi Datasets</h1>
+<p>There are 12 datasets in total in the Multi language category (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-multi-2015-14">Colossal OSCAR 1 [multi; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2016-40">Colossal OSCAR 1 [multi; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2017-43">Colossal OSCAR 1 [multi; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2018-47">Colossal OSCAR 1 [multi; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2019-22">Colossal OSCAR 1 [multi; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2020-24">Colossal OSCAR 1 [multi; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2020-45">Colossal OSCAR 1 [multi; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2021-49">Colossal OSCAR 1 [multi; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2022-27">Colossal OSCAR 1 [multi; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2022-49">Colossal OSCAR 1 [multi; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2023-14">Colossal OSCAR 1 [multi; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-multi-2023-23">Colossal OSCAR 1 [multi; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_multi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [multi; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mwl/index.html b/datasets/language_mwl/index.html
new file mode 100644
index 0000000..b258984
--- /dev/null
+++ b/datasets/language_mwl/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mwl/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Mwl Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#mwl-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Mwl Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mwl-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mwl; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="mwl-datasets">Mwl Datasets</h1>
+<p>There are 12 datasets in total for the Mwl language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-mwl-2015-14">Colossal OSCAR 1 [mwl; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2016-40">Colossal OSCAR 1 [mwl; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2017-43">Colossal OSCAR 1 [mwl; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2018-47">Colossal OSCAR 1 [mwl; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2019-22">Colossal OSCAR 1 [mwl; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2020-24">Colossal OSCAR 1 [mwl; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2020-45">Colossal OSCAR 1 [mwl; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2021-49">Colossal OSCAR 1 [mwl; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2022-27">Colossal OSCAR 1 [mwl; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2022-49">Colossal OSCAR 1 [mwl; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2023-14">Colossal OSCAR 1 [mwl; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mwl-2023-23">Colossal OSCAR 1 [mwl; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mwl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mwl; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_my/index.html b/datasets/language_my/index.html
new file mode 100644
index 0000000..e32b8bb
--- /dev/null
+++ b/datasets/language_my/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_my/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Burmese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#burmese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Burmese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-my-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [my; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="burmese-datasets">Burmese Datasets</h1>
+<p>There are 12 datasets in total for the Burmese language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-my-2015-14">Colossal OSCAR 1 [my; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2016-40">Colossal OSCAR 1 [my; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2017-43">Colossal OSCAR 1 [my; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2018-47">Colossal OSCAR 1 [my; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2019-22">Colossal OSCAR 1 [my; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2020-24">Colossal OSCAR 1 [my; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2020-45">Colossal OSCAR 1 [my; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2021-49">Colossal OSCAR 1 [my; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2022-27">Colossal OSCAR 1 [my; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2022-49">Colossal OSCAR 1 [my; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2023-14">Colossal OSCAR 1 [my; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-my-2023-23">Colossal OSCAR 1 [my; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_my</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [my; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_mzn/index.html b/datasets/language_mzn/index.html
new file mode 100644
index 0000000..2257915
--- /dev/null
+++ b/datasets/language_mzn/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_mzn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Mzn Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#mzn-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Mzn Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-mzn-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [mzn; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="mzn-datasets">Mzn Datasets</h1>
+<p>There are 12 datasets in total for the Mzn language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-mzn-2015-14">Colossal OSCAR 1 [mzn; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2016-40">Colossal OSCAR 1 [mzn; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2017-43">Colossal OSCAR 1 [mzn; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2018-47">Colossal OSCAR 1 [mzn; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2019-22">Colossal OSCAR 1 [mzn; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2020-24">Colossal OSCAR 1 [mzn; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2020-45">Colossal OSCAR 1 [mzn; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2021-49">Colossal OSCAR 1 [mzn; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2022-27">Colossal OSCAR 1 [mzn; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2022-49">Colossal OSCAR 1 [mzn; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2023-14">Colossal OSCAR 1 [mzn; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-mzn-2023-23">Colossal OSCAR 1 [mzn; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_mzn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [mzn; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_nah/index.html b/datasets/language_nah/index.html
new file mode 100644
index 0000000..a6246e6
--- /dev/null
+++ b/datasets/language_nah/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_nah/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Nah Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#nah-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Nah Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nah-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nah; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="nah-datasets">Nah Datasets</h1>
+<p>In total, there are 12 datasets with N/A tokens in the Nah language.</p>
+<h2 id="colossal-oscar-1-nah-2015-14">Colossal OSCAR 1 [nah; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2016-40">Colossal OSCAR 1 [nah; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2017-43">Colossal OSCAR 1 [nah; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2018-47">Colossal OSCAR 1 [nah; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2019-22">Colossal OSCAR 1 [nah; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2020-24">Colossal OSCAR 1 [nah; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2020-45">Colossal OSCAR 1 [nah; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2021-49">Colossal OSCAR 1 [nah; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2022-27">Colossal OSCAR 1 [nah; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2022-49">Colossal OSCAR 1 [nah; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2023-14">Colossal OSCAR 1 [nah; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nah-2023-23">Colossal OSCAR 1 [nah; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_nah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nah; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_nds/index.html b/datasets/language_nds/index.html
new file mode 100644
index 0000000..bb1e6b7
--- /dev/null
+++ b/datasets/language_nds/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_nds/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Nds Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#nds-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Nds Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nds-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nds; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="nds-datasets">Nds Datasets</h1>
+<p>There are 12 datasets in total for the Nds language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-nds-2015-14">Colossal OSCAR 1 [nds; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2016-40">Colossal OSCAR 1 [nds; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2017-43">Colossal OSCAR 1 [nds; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2018-47">Colossal OSCAR 1 [nds; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2019-22">Colossal OSCAR 1 [nds; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2020-24">Colossal OSCAR 1 [nds; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2020-45">Colossal OSCAR 1 [nds; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2021-49">Colossal OSCAR 1 [nds; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2022-27">Colossal OSCAR 1 [nds; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2022-49">Colossal OSCAR 1 [nds; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2023-14">Colossal OSCAR 1 [nds; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nds-2023-23">Colossal OSCAR 1 [nds; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_nds</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nds; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ne/index.html b/datasets/language_ne/index.html
new file mode 100644
index 0000000..8c8a613
--- /dev/null
+++ b/datasets/language_ne/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ne/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Nepali Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#nepali-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Nepali Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ne-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ne; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="nepali-datasets">Nepali Datasets</h1>
+<p>There are 12 datasets in total for the Nepali language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ne-2015-14">Colossal OSCAR 1 [ne; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2016-40">Colossal OSCAR 1 [ne; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2017-43">Colossal OSCAR 1 [ne; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2018-47">Colossal OSCAR 1 [ne; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2019-22">Colossal OSCAR 1 [ne; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2020-24">Colossal OSCAR 1 [ne; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2020-45">Colossal OSCAR 1 [ne; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2021-49">Colossal OSCAR 1 [ne; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2022-27">Colossal OSCAR 1 [ne; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2022-49">Colossal OSCAR 1 [ne; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2023-14">Colossal OSCAR 1 [ne; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ne-2023-23">Colossal OSCAR 1 [ne; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ne</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ne; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_new/index.html b/datasets/language_new/index.html
new file mode 100644
index 0000000..97ff27b
--- /dev/null
+++ b/datasets/language_new/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_new/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>New Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#new-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              New Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-new-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [new; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="new-datasets">New Datasets</h1>
+<p>In total, there are 12 datasets in the New language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-new-2015-14">Colossal OSCAR 1 [new; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2016-40">Colossal OSCAR 1 [new; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2017-43">Colossal OSCAR 1 [new; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2018-47">Colossal OSCAR 1 [new; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2019-22">Colossal OSCAR 1 [new; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2020-24">Colossal OSCAR 1 [new; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2020-45">Colossal OSCAR 1 [new; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2021-49">Colossal OSCAR 1 [new; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2022-27">Colossal OSCAR 1 [new; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2022-49">Colossal OSCAR 1 [new; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2023-14">Colossal OSCAR 1 [new; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-new-2023-23">Colossal OSCAR 1 [new; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_new</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [new; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_nl/index.html b/datasets/language_nl/index.html
new file mode 100644
index 0000000..7e28361
--- /dev/null
+++ b/datasets/language_nl/index.html
@@ -0,0 +1,1765 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_nl/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Dutch Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#dutch-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Dutch Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nl-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nl; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#sonar-corpus-nc-12" class="md-nav__link">
+    <span class="md-ellipsis">
+      SoNaR Corpus NC 1.2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#sonar-corpus-nc-12_1" class="md-nav__link">
+    <span class="md-ellipsis">
+      SoNaR Corpus NC 1.2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#sonar-corpus-nc-12_2" class="md-nav__link">
+    <span class="md-ellipsis">
+      SoNaR Corpus NC 1.2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#sonar-corpus-nc-12_3" class="md-nav__link">
+    <span class="md-ellipsis">
+      SoNaR Corpus NC 1.2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#sonar-corpus-nc-12_4" class="md-nav__link">
+    <span class="md-ellipsis">
+      SoNaR Corpus NC 1.2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#sonar-corpus-nc-12_5" class="md-nav__link">
+    <span class="md-ellipsis">
+      SoNaR Corpus NC 1.2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [nl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-nl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [nl]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="dutch-datasets">Dutch Datasets</h1>
+<p>There are 26 datasets in total, with 26 B tokens, in the Dutch language.</p>
+<h2 id="colossal-oscar-1-nl-2015-14">Colossal OSCAR 1 [nl; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2016-40">Colossal OSCAR 1 [nl; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2017-43">Colossal OSCAR 1 [nl; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2018-47">Colossal OSCAR 1 [nl; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2019-22">Colossal OSCAR 1 [nl; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2020-24">Colossal OSCAR 1 [nl; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2020-45">Colossal OSCAR 1 [nl; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2021-49">Colossal OSCAR 1 [nl; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2022-27">Colossal OSCAR 1 [nl; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2022-49">Colossal OSCAR 1 [nl; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2023-14">Colossal OSCAR 1 [nl; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nl-2023-23">Colossal OSCAR 1 [nl; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nl; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>17 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-nl">EurlexResources [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-nl">LegalMC4 [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>22 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="sonar-corpus-nc-12">SoNaR Corpus NC 1.2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sonar_subtitles</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SoNaR Corpus NC 1.2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/">https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="sonar-corpus-nc-12_1">SoNaR Corpus NC 1.2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sonar_edu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SoNaR Corpus NC 1.2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/">https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="sonar-corpus-nc-12_2">SoNaR Corpus NC 1.2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sonar_news</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SoNaR Corpus NC 1.2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/">https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="sonar-corpus-nc-12_3">SoNaR Corpus NC 1.2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sonar_books</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SoNaR Corpus NC 1.2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/">https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="sonar-corpus-nc-12_4">SoNaR Corpus NC 1.2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sonar_gov</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SoNaR Corpus NC 1.2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/">https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="sonar-corpus-nc-12_5">SoNaR Corpus NC 1.2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sonar_web</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SoNaR Corpus NC 1.2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/">https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown, likely research only or fair use (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>500 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-nl">Wikibooks [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>8 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-nl">Wikinews [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-nl">Wikipedia [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>263 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-nl">Wikiquote [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>75 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-nl">Wikisource [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>16 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-nl">Wikivoyage [nl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_nl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [nl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_nn/index.html b/datasets/language_nn/index.html
new file mode 100644
index 0000000..f06b4f9
--- /dev/null
+++ b/datasets/language_nn/index.html
@@ -0,0 +1,1281 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_nn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Norwegian Nynorsk Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#norwegian-nynorsk-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Norwegian Nynorsk Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-nn-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [nn; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#norwegian-colossal-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      Norwegian Colossal Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-nn" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [nn]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-nn" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [nn]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="norwegian-nynorsk-datasets">Norwegian Nynorsk Datasets</h1>
+<p>There are 15 datasets in total, comprising 301 M tokens, in the Norwegian Nynorsk language.</p>
+<h2 id="colossal-oscar-1-nn-2015-14">Colossal OSCAR 1 [nn; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2016-40">Colossal OSCAR 1 [nn; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2017-43">Colossal OSCAR 1 [nn; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2018-47">Colossal OSCAR 1 [nn; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2019-22">Colossal OSCAR 1 [nn; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2020-24">Colossal OSCAR 1 [nn; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2020-45">Colossal OSCAR 1 [nn; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2021-49">Colossal OSCAR 1 [nn; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2022-27">Colossal OSCAR 1 [nn; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2022-49">Colossal OSCAR 1 [nn; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2023-14">Colossal OSCAR 1 [nn; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-nn-2023-23">Colossal OSCAR 1 [nn; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [nn; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>857 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="norwegian-colossal-corpus">Norwegian Colossal Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>norwegian_cc_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Norwegian Colossal Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The Norwegian Colossal Corpus is a collection of multiple smaller Norwegian corpuses suitable for training large language models. We have done extensive cleaning on the datasets, and have made them available in a common format. The total size of the NCC is currently 45GB. Documents: 20,830,348; Words/document: 331</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/NbAiLab/NCC">https://huggingface.co/datasets/NbAiLab/NCC</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed (NLOD 2.0, CC0 1.0, CC BY-NC 2.0, CC BY-SA 3.0) (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>300 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-nn">Wikipedia [nn]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [nn]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-nn">Wikiquote [nn]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_nn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [nn]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_no/index.html b/datasets/language_no/index.html
new file mode 100644
index 0000000..e4b1520
--- /dev/null
+++ b/datasets/language_no/index.html
@@ -0,0 +1,1457 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_no/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Norwegian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#norwegian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Norwegian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-no-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [no; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#norwegian-colossal-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      Norwegian Colossal Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-no" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [no]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-no" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [no]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-no" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [no]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-no" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [no]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-no" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [no]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-no" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [no]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="norwegian-datasets">Norwegian Datasets</h1>
+<p>In total, there are 19 datasets with 5 B tokens in the Norwegian language.</p>
+<h2 id="colossal-oscar-1-no-2015-14">Colossal OSCAR 1 [no; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2016-40">Colossal OSCAR 1 [no; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2017-43">Colossal OSCAR 1 [no; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2018-47">Colossal OSCAR 1 [no; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2019-22">Colossal OSCAR 1 [no; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2020-24">Colossal OSCAR 1 [no; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2020-45">Colossal OSCAR 1 [no; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2021-49">Colossal OSCAR 1 [no; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2022-27">Colossal OSCAR 1 [no; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2022-49">Colossal OSCAR 1 [no; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2023-14">Colossal OSCAR 1 [no; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-no-2023-23">Colossal OSCAR 1 [no; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [no; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>345 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="norwegian-colossal-corpus">Norwegian Colossal Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>norwegian_cc_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Norwegian Colossal Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The Norwegian Colossal Corpus is a collection of multiple smaller Norwegian corpuses suitable for training large language models. We have done extensive cleaning on the datasets, and have made them available in a common format. The total size of the NCC is currently 45GB. Documents: 20,830,348; Words/document: 331</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/NbAiLab/NCC">https://huggingface.co/datasets/NbAiLab/NCC</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>mixed (NLOD 2.0, CC0 1.0, CC BY-NC 2.0, CC BY-SA 3.0) (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-no">Wikibooks [no]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [no]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-no">Wikinews [no]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [no]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-no">Wikipedia [no]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [no]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-no">Wikiquote [no]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [no]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-no">Wikisource [no]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [no]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-no">Wikivoyage [no]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_no</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [no]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ny/index.html b/datasets/language_ny/index.html
new file mode 100644
index 0000000..cb5b509
--- /dev/null
+++ b/datasets/language_ny/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ny/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Chichewa Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#chichewa-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Chichewa Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-chichewa" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Chichewa]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="chichewa-datasets">Chichewa Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Chichewa language.</p>
+<h2 id="wura-chichewa">WURA [Chichewa]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_ny</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Chichewa]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly spoken in Africa.</td>
+</tr>
+
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_oc/index.html b/datasets/language_oc/index.html
new file mode 100644
index 0000000..9e30958
--- /dev/null
+++ b/datasets/language_oc/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_oc/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Occitan Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#occitan-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Occitan Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-oc-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [oc; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="occitan-datasets">Occitan Datasets</h1>
+<p>In total, there are 12 datasets (token count: N/A) in the Occitan language.</p>
+<h2 id="colossal-oscar-1-oc-2015-14">Colossal OSCAR 1 [oc; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2016-40">Colossal OSCAR 1 [oc; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2017-43">Colossal OSCAR 1 [oc; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2018-47">Colossal OSCAR 1 [oc; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2019-22">Colossal OSCAR 1 [oc; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2020-24">Colossal OSCAR 1 [oc; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2020-45">Colossal OSCAR 1 [oc; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2021-49">Colossal OSCAR 1 [oc; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2022-27">Colossal OSCAR 1 [oc; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2022-49">Colossal OSCAR 1 [oc; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2023-14">Colossal OSCAR 1 [oc; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-oc-2023-23">Colossal OSCAR 1 [oc; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_oc</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [oc; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_om/index.html b/datasets/language_om/index.html
new file mode 100644
index 0000000..e572b74
--- /dev/null
+++ b/datasets/language_om/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_om/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Oromo Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#oromo-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Oromo Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-oromo" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Oromo]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="oromo-datasets">Oromo Datasets</h1>
+<p>There is 1 dataset in total (token count: N/A) in the Oromo language.</p>
+<h2 id="wura-oromo">WURA [Oromo]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_om</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Oromo]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>
+Wura is large-scale pretraining data
+for 20 languages popularly
+spoken in Africa.
+</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_or/index.html b/datasets/language_or/index.html
new file mode 100644
index 0000000..046b882
--- /dev/null
+++ b/datasets/language_or/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_or/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Oriya Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#oriya-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Oriya Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-or-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [or; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="oriya-datasets">Oriya Datasets</h1>
+<p>There are 12 datasets in total (token count: N/A) in the Oriya language.</p>
+<h2 id="colossal-oscar-1-or-2015-14">Colossal OSCAR 1 [or; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2016-40">Colossal OSCAR 1 [or; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2017-43">Colossal OSCAR 1 [or; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2018-47">Colossal OSCAR 1 [or; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2019-22">Colossal OSCAR 1 [or; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2020-24">Colossal OSCAR 1 [or; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2020-45">Colossal OSCAR 1 [or; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2021-49">Colossal OSCAR 1 [or; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2022-27">Colossal OSCAR 1 [or; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2022-49">Colossal OSCAR 1 [or; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2023-14">Colossal OSCAR 1 [or; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-or-2023-23">Colossal OSCAR 1 [or; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_or</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [or; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_os/index.html b/datasets/language_os/index.html
new file mode 100644
index 0000000..0d4d10a
--- /dev/null
+++ b/datasets/language_os/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_os/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ossetian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ossetian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ossetian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-os-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [os; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ossetian-datasets">Ossetian Datasets</h1>
+<p>There are 12 datasets in total for the Ossetian language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-os-2015-14">Colossal OSCAR 1 [os; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2016-40">Colossal OSCAR 1 [os; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2017-43">Colossal OSCAR 1 [os; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2018-47">Colossal OSCAR 1 [os; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2019-22">Colossal OSCAR 1 [os; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2020-24">Colossal OSCAR 1 [os; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2020-45">Colossal OSCAR 1 [os; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2021-49">Colossal OSCAR 1 [os; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2022-27">Colossal OSCAR 1 [os; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2022-49">Colossal OSCAR 1 [os; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2023-14">Colossal OSCAR 1 [os; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-os-2023-23">Colossal OSCAR 1 [os; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_os</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [os; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_pa/index.html b/datasets/language_pa/index.html
new file mode 100644
index 0000000..429a9a8
--- /dev/null
+++ b/datasets/language_pa/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_pa/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Panjabi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#panjabi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Panjabi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pa-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pa; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="panjabi-datasets">Panjabi Datasets</h1>
+<p>There are 12 datasets in total for the Panjabi language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-pa-2015-14">Colossal OSCAR 1 [pa; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2016-40">Colossal OSCAR 1 [pa; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2017-43">Colossal OSCAR 1 [pa; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2018-47">Colossal OSCAR 1 [pa; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2019-22">Colossal OSCAR 1 [pa; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2020-24">Colossal OSCAR 1 [pa; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2020-45">Colossal OSCAR 1 [pa; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2021-49">Colossal OSCAR 1 [pa; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2022-27">Colossal OSCAR 1 [pa; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2022-49">Colossal OSCAR 1 [pa; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2023-14">Colossal OSCAR 1 [pa; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pa-2023-23">Colossal OSCAR 1 [pa; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_pa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pa; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_pl/index.html b/datasets/language_pl/index.html
new file mode 100644
index 0000000..82ee999
--- /dev/null
+++ b/datasets/language_pl/index.html
@@ -0,0 +1,1633 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_pl/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Polish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#polish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Polish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-polish" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Polish]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pl-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pl; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#nkjp-podkorpusmilionowy-12-national-corpus-of-polish" class="md-nav__link">
+    <span class="md-ellipsis">
+      NKJP-PodkorpusMilionowy-1.2 (National Corpus of Polish)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#polish-parliamentary-corpus-korpus-dyskursu-parlamentarnego" class="md-nav__link">
+    <span class="md-ellipsis">
+      Polish Parliamentary Corpus / Korpus Dyskursu Parlamentarnego
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [pl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-pl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [pl]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="polish-datasets">Polish Datasets</h1>
+<p>There are 23 datasets in total, with 25 B tokens in the Polish language.</p>
+<h2 id="curlicat-corpus-polish">CURLICAT Corpus [Polish]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Polish]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://elrc-share.eu/repository/browse/curlicat-polish-corpus/f63ae912553911ed9c1a00155d02670648c0a234e0314895b52169af2af57dd7/]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA-4.0 (commercial use: None, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>59 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2015-14">Colossal OSCAR 1 [pl; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2016-40">Colossal OSCAR 1 [pl; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2017-43">Colossal OSCAR 1 [pl; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2018-47">Colossal OSCAR 1 [pl; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2019-22">Colossal OSCAR 1 [pl; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2020-24">Colossal OSCAR 1 [pl; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2020-45">Colossal OSCAR 1 [pl; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2021-49">Colossal OSCAR 1 [pl; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2022-27">Colossal OSCAR 1 [pl; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2022-49">Colossal OSCAR 1 [pl; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2023-14">Colossal OSCAR 1 [pl; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pl-2023-23">Colossal OSCAR 1 [pl; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pl; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>18 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-pl">EurlexResources [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-pl">LegalMC4 [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="nkjp-podkorpusmilionowy-12-national-corpus-of-polish">NKJP-PodkorpusMilionowy-1.2 (National Corpus of Polish)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pl_nkjp</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>NKJP-PodkorpusMilionowy-1.2 (National Corpus of Polish)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A reference corpus of the Polish language containing over fifteen hundred million words. The list of sources for the corpora contains classic literature, daily newspapers, specialist periodicals and journals, transcripts of conversations, and a variety of short-lived and internet texts.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[http://clip.ipipan.waw.pl/NationalCorpusOfPolish]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="polish-parliamentary-corpus-korpus-dyskursu-parlamentarnego">Polish Parliamentary Corpus / Korpus Dyskursu Parlamentarnego</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>pl_parliamentary_corpus</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Polish Parliamentary Corpus / Korpus Dyskursu Parlamentarnego</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The Polish Parliamentary Corpus (PPC) is a Polish corpus made up of documents from the proceedings of the Polish Parliament, Sejm, and Senate. The corpus includes data of the Polish Sejm corpus and consists of stenographic records of plenary sittings and committee sittings, segments of interpellations and questions. Texts in the PPC corpus cover the period of a hundred years from 1919 to 2019.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[http://clip.ipipan.waw.pl/PPC]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>671 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-pl">Wikibooks [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>10 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-pl">Wikinews [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikinews.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>9 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-pl">Wikipedia [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>361 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-pl">Wikiquote [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>29 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-pl">Wikisource [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>19 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-pl">Wikivoyage [pl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_pl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [pl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikivoyage.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>9 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_pms/index.html b/datasets/language_pms/index.html
new file mode 100644
index 0000000..8013938
--- /dev/null
+++ b/datasets/language_pms/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_pms/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Pms Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#pms-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Pms Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pms-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pms; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="pms-datasets">Pms Datasets</h1>
+<p>There are 12 datasets in total for the Pms language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-pms-2015-14">Colossal OSCAR 1 [pms; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2016-40">Colossal OSCAR 1 [pms; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2017-43">Colossal OSCAR 1 [pms; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2018-47">Colossal OSCAR 1 [pms; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2019-22">Colossal OSCAR 1 [pms; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2020-24">Colossal OSCAR 1 [pms; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2020-45">Colossal OSCAR 1 [pms; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2021-49">Colossal OSCAR 1 [pms; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2022-27">Colossal OSCAR 1 [pms; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2022-49">Colossal OSCAR 1 [pms; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2023-14">Colossal OSCAR 1 [pms; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pms-2023-23">Colossal OSCAR 1 [pms; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_pms</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pms; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_pnb/index.html b/datasets/language_pnb/index.html
new file mode 100644
index 0000000..900d78a
--- /dev/null
+++ b/datasets/language_pnb/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_pnb/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Pnb Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#pnb-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Pnb Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pnb-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pnb; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="pnb-datasets">Pnb Datasets</h1>
+<p>There are 12 datasets in total for the Pnb language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-pnb-2015-14">Colossal OSCAR 1 [pnb; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2016-40">Colossal OSCAR 1 [pnb; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2017-43">Colossal OSCAR 1 [pnb; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2018-47">Colossal OSCAR 1 [pnb; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2019-22">Colossal OSCAR 1 [pnb; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2020-24">Colossal OSCAR 1 [pnb; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2020-45">Colossal OSCAR 1 [pnb; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2021-49">Colossal OSCAR 1 [pnb; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2022-27">Colossal OSCAR 1 [pnb; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2022-49">Colossal OSCAR 1 [pnb; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2023-14">Colossal OSCAR 1 [pnb; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pnb-2023-23">Colossal OSCAR 1 [pnb; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_pnb</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pnb; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ps/index.html b/datasets/language_ps/index.html
new file mode 100644
index 0000000..0c74693
--- /dev/null
+++ b/datasets/language_ps/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ps/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Pashto Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#pashto-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Pashto Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ps-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ps; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="pashto-datasets">Pashto Datasets</h1>
+<p>There are 12 datasets in total for the Pashto language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-ps-2015-14">Colossal OSCAR 1 [ps; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2016-40">Colossal OSCAR 1 [ps; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2017-43">Colossal OSCAR 1 [ps; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2018-47">Colossal OSCAR 1 [ps; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2019-22">Colossal OSCAR 1 [ps; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2020-24">Colossal OSCAR 1 [ps; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2020-45">Colossal OSCAR 1 [ps; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2021-49">Colossal OSCAR 1 [ps; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2022-27">Colossal OSCAR 1 [ps; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2022-49">Colossal OSCAR 1 [ps; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2023-14">Colossal OSCAR 1 [ps; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ps-2023-23">Colossal OSCAR 1 [ps; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ps</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ps; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_pt/index.html b/datasets/language_pt/index.html
new file mode 100644
index 0000000..dbe80fb
--- /dev/null
+++ b/datasets/language_pt/index.html
@@ -0,0 +1,1653 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_pt/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Portuguese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#portuguese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Portuguese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#brazilian-portuguese-web-as-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      Brazilian Portuguese Web as Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-pt-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [pt; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#parlamentopt" class="md-nav__link">
+    <span class="md-ellipsis">
+      ParlamentoPT
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-portuguese" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Portuguese]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [pt]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-pt" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [pt]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="portuguese-datasets">Portuguese Datasets</h1>
+<p>There are 23 datasets in total, with 24 B tokens, in the Portuguese language.</p>
+<h2 id="brazilian-portuguese-web-as-corpus">Brazilian Portuguese Web as Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>brwac</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Brazilian Portuguese Web as Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The BrWaC (Brazilian Portuguese Web as Corpus) is a large corpus constructed following the Wacky framework, which was made public for research purposes. The current corpus version, released in January 2017, is composed of 3.53 million documents, 2.68 billion tokens and 5.79 million types. Please note that this resource is available solely for academic research purposes, and you agreed not to use it for any commercial applications.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/brwac">https://huggingface.co/datasets/brwac</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>research-only (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2015-14">Colossal OSCAR 1 [pt; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2016-40">Colossal OSCAR 1 [pt; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2017-43">Colossal OSCAR 1 [pt; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2018-47">Colossal OSCAR 1 [pt; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2019-22">Colossal OSCAR 1 [pt; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2020-24">Colossal OSCAR 1 [pt; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2020-45">Colossal OSCAR 1 [pt; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2021-49">Colossal OSCAR 1 [pt; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2022-27">Colossal OSCAR 1 [pt; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2022-49">Colossal OSCAR 1 [pt; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2023-14">Colossal OSCAR 1 [pt; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-pt-2023-23">Colossal OSCAR 1 [pt; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [pt; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>12 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-pt">EurlexResources [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-pt">LegalMC4 [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="parlamentopt">ParlamentoPT</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>parlamento_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>ParlamentoPT</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The ParlamentoPT is a Portuguese language data set obtained by collecting publicly
+available documents containing transcriptions of debates in the Portuguese Parliament. The data was collected from
+the Portuguese Parliament portal in accordance with its open data policy.</td>
+</tr>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/PORTULAN/parlamento-pt">https://huggingface.co/datasets/PORTULAN/parlamento-pt</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>open data (Portuguese Parliament portal policy) (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>819 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-portuguese">WURA [Portuguese]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Portuguese]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+</tr>
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-pt">Wikibooks [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>13 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-pt">Wikinews [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>16 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-pt">Wikipedia [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>466 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-pt">Wikiquote [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>7 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-pt">Wikisource [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>35 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-pt">Wikivoyage [pt]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_pt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [pt]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_qu/index.html b/datasets/language_qu/index.html
new file mode 100644
index 0000000..91fcfb7
--- /dev/null
+++ b/datasets/language_qu/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_qu/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Quechua Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#quechua-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Quechua Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-qu-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [qu; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="quechua-datasets">Quechua Datasets</h1>
+<p>There are 12 datasets in total for the Quechua language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-qu-2015-14">Colossal OSCAR 1 [qu; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2016-40">Colossal OSCAR 1 [qu; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2017-43">Colossal OSCAR 1 [qu; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2018-47">Colossal OSCAR 1 [qu; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2019-22">Colossal OSCAR 1 [qu; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2020-24">Colossal OSCAR 1 [qu; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2020-45">Colossal OSCAR 1 [qu; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2021-49">Colossal OSCAR 1 [qu; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2022-27">Colossal OSCAR 1 [qu; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2022-49">Colossal OSCAR 1 [qu; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2023-14">Colossal OSCAR 1 [qu; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-qu-2023-23">Colossal OSCAR 1 [qu; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_qu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [qu; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ro/index.html b/datasets/language_ro/index.html
new file mode 100644
index 0000000..e474f86
--- /dev/null
+++ b/datasets/language_ro/index.html
@@ -0,0 +1,1589 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ro/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Romanian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#romanian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Romanian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-romanian" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Romanian]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ro-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ro; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#marcell-romanian-legislative-subcorpus-v2" class="md-nav__link">
+    <span class="md-ellipsis">
+      MARCELL Romanian legislative subcorpus v2
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [ro]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-ro" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [ro]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="romanian-datasets">Romanian Datasets</h1>
+<p>In total, there are 22 datasets with 9 B tokens in the Romanian language.</p>
+<h2 id="curlicat-corpus-romanian">CURLICAT Corpus [Romanian]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Romanian]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
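+<td><a href="https://elrc-share.eu/repository/browse/curlicat-romanian-corpus/8b6c8dca58ea11ed9c1a00155d026706fb03ef8b4c1847cfbe9cea869a82731e/">https://elrc-share.eu/repository/browse/curlicat-romanian-corpus/8b6c8dca58ea11ed9c1a00155d026706fb03ef8b4c1847cfbe9cea869a82731e/</a></td>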
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA-4.0 (commercial use: None, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>95 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2015-14">Colossal OSCAR 1 [ro; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2016-40">Colossal OSCAR 1 [ro; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2017-43">Colossal OSCAR 1 [ro; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2018-47">Colossal OSCAR 1 [ro; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2019-22">Colossal OSCAR 1 [ro; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2020-24">Colossal OSCAR 1 [ro; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2020-45">Colossal OSCAR 1 [ro; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2021-49">Colossal OSCAR 1 [ro; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2022-27">Colossal OSCAR 1 [ro; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2022-49">Colossal OSCAR 1 [ro; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2023-14">Colossal OSCAR 1 [ro; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ro-2023-23">Colossal OSCAR 1 [ro; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ro; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-ro">EurlexResources [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-ro">LegalMC4 [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>551 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="marcell-romanian-legislative-subcorpus-v2">MARCELL Romanian legislative subcorpus v2</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>marcell_legislative_subcorpus_v2</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MARCELL Romanian legislative subcorpus v2</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The Romanian corpus contains 163,274 files, which represent the body of national legislation ranging from 1881 to 2021. This corpus includes mainly: governmental decisions, ministerial orders, decisions, decrees and laws. All the texts were obtained via crawling from the public Romanian legislative portal. This corpus resulted from the MARCELL project. Alternate download location: https://relate.racai.ro/marcell/new/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
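+<td><a href="https://elrc-share.eu/repository/browse/marcell-romanian-legislative-subcorpus-v2/2da548428b9d11eb9c1a00155d026706ce94a6b59ffc4b0e9fb5cd9cebe6889e/">https://elrc-share.eu/repository/browse/marcell-romanian-legislative-subcorpus-v2/2da548428b9d11eb9c1a00155d026706ce94a6b59ffc4b0e9fb5cd9cebe6889e/</a></td>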
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>public domain</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>31 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-ro">Wikibooks [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-ro">Wikinews [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>744 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-ro">Wikipedia [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>152 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-ro">Wikiquote [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>436 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-ro">Wikisource [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>49 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-ro">Wikivoyage [ro]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_ro</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [ro]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>507 k</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ru/index.html b/datasets/language_ru/index.html
new file mode 100644
index 0000000..7b955f6
--- /dev/null
+++ b/datasets/language_ru/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ru/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Russian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#russian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Russian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ru-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ru; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="russian-datasets">Russian Datasets</h1>
+<p>There are 12 datasets in total for the Russian language; their combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-ru-2015-14">Colossal OSCAR 1 [ru; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2016-40">Colossal OSCAR 1 [ru; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2017-43">Colossal OSCAR 1 [ru; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2018-47">Colossal OSCAR 1 [ru; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2019-22">Colossal OSCAR 1 [ru; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2020-24">Colossal OSCAR 1 [ru; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2020-45">Colossal OSCAR 1 [ru; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2021-49">Colossal OSCAR 1 [ru; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2022-27">Colossal OSCAR 1 [ru; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2022-49">Colossal OSCAR 1 [ru; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2023-14">Colossal OSCAR 1 [ru; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ru-2023-23">Colossal OSCAR 1 [ru; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ru</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ru; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_rw/index.html b/datasets/language_rw/index.html
new file mode 100644
index 0000000..4eba59a
--- /dev/null
+++ b/datasets/language_rw/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_rw/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Kinyarwanda Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#kinyarwanda-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Kinyarwanda Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-kinyarwanda" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Kinyarwanda]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="kinyarwanda-datasets">Kinyarwanda Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Kinyarwanda language.</p>
+<h2 id="wura-kinyarwanda">WURA [Kinyarwanda]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_rw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Kinyarwanda]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly</td>
+</tr>
+<tr>
+<td>spoken in Africa.</td>
+<td></td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sa/index.html b/datasets/language_sa/index.html
new file mode 100644
index 0000000..61f31bd
--- /dev/null
+++ b/datasets/language_sa/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sa/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Sanskrit Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#sanskrit-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Sanskrit Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sa-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sa; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="sanskrit-datasets">Sanskrit Datasets</h1>
+<p>There are 12 Sanskrit-language datasets in total; the total token count is N/A.</p>
+<h2 id="colossal-oscar-1-sa-2015-14">Colossal OSCAR 1 [sa; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2016-40">Colossal OSCAR 1 [sa; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2017-43">Colossal OSCAR 1 [sa; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2018-47">Colossal OSCAR 1 [sa; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2019-22">Colossal OSCAR 1 [sa; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2020-24">Colossal OSCAR 1 [sa; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2020-45">Colossal OSCAR 1 [sa; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2021-49">Colossal OSCAR 1 [sa; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2022-27">Colossal OSCAR 1 [sa; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2022-49">Colossal OSCAR 1 [sa; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2023-14">Colossal OSCAR 1 [sa; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sa-2023-23">Colossal OSCAR 1 [sa; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sa; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sah/index.html b/datasets/language_sah/index.html
new file mode 100644
index 0000000..40bc549
--- /dev/null
+++ b/datasets/language_sah/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sah/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Sah Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#sah-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Sah Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sah-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sah; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="sah-datasets">Sah Datasets</h1>
+<p>There are 12 datasets in total for the Sah language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-sah-2015-14">Colossal OSCAR 1 [sah; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2016-40">Colossal OSCAR 1 [sah; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2017-43">Colossal OSCAR 1 [sah; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2018-47">Colossal OSCAR 1 [sah; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2019-22">Colossal OSCAR 1 [sah; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2020-24">Colossal OSCAR 1 [sah; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2020-45">Colossal OSCAR 1 [sah; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2021-49">Colossal OSCAR 1 [sah; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2022-27">Colossal OSCAR 1 [sah; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2022-49">Colossal OSCAR 1 [sah; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2023-14">Colossal OSCAR 1 [sah; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sah-2023-23">Colossal OSCAR 1 [sah; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sah</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sah; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sd/index.html b/datasets/language_sd/index.html
new file mode 100644
index 0000000..f495e95
--- /dev/null
+++ b/datasets/language_sd/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sd/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Sindhi Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#sindhi-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Sindhi Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sd-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sd; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="sindhi-datasets">Sindhi Datasets</h1>
+<p>There are 12 datasets in total for the Sindhi language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-sd-2015-14">Colossal OSCAR 1 [sd; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2016-40">Colossal OSCAR 1 [sd; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2017-43">Colossal OSCAR 1 [sd; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2018-47">Colossal OSCAR 1 [sd; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2019-22">Colossal OSCAR 1 [sd; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2020-24">Colossal OSCAR 1 [sd; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2020-45">Colossal OSCAR 1 [sd; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2021-49">Colossal OSCAR 1 [sd; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2022-27">Colossal OSCAR 1 [sd; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2022-49">Colossal OSCAR 1 [sd; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2023-14">Colossal OSCAR 1 [sd; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sd-2023-23">Colossal OSCAR 1 [sd; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sd</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sd; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sh/index.html b/datasets/language_sh/index.html
new file mode 100644
index 0000000..ac9f7e4
--- /dev/null
+++ b/datasets/language_sh/index.html
@@ -0,0 +1,1193 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sh/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Serbo-Croatian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#serbo-croatian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Serbo-Croatian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sh-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sh; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-sh" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [sh]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="serbo-croatian-datasets">Serbo-Croatian Datasets</h1>
+<p>There are 13 datasets in total, with 58 k tokens, in the Serbo-Croatian language.</p>
+<h2 id="colossal-oscar-1-sh-2015-14">Colossal OSCAR 1 [sh; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2016-40">Colossal OSCAR 1 [sh; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2017-43">Colossal OSCAR 1 [sh; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2018-47">Colossal OSCAR 1 [sh; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2019-22">Colossal OSCAR 1 [sh; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2020-24">Colossal OSCAR 1 [sh; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2020-45">Colossal OSCAR 1 [sh; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2021-49">Colossal OSCAR 1 [sh; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2022-27">Colossal OSCAR 1 [sh; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2022-49">Colossal OSCAR 1 [sh; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2023-14">Colossal OSCAR 1 [sh; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sh-2023-23">Colossal OSCAR 1 [sh; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sh; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>58 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-sh">Wikipedia [sh]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_sh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [sh]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_si/index.html b/datasets/language_si/index.html
new file mode 100644
index 0000000..f4ad5c7
--- /dev/null
+++ b/datasets/language_si/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_si/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Sinhalese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#sinhalese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Sinhalese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-si-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [si; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="sinhalese-datasets">Sinhalese Datasets</h1>
+<p>There are 12 datasets in total, with N/A tokens, in the Sinhalese language.</p>
+<h2 id="colossal-oscar-1-si-2015-14">Colossal OSCAR 1 [si; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2016-40">Colossal OSCAR 1 [si; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2017-43">Colossal OSCAR 1 [si; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2018-47">Colossal OSCAR 1 [si; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2019-22">Colossal OSCAR 1 [si; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2020-24">Colossal OSCAR 1 [si; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2020-45">Colossal OSCAR 1 [si; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2021-49">Colossal OSCAR 1 [si; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2022-27">Colossal OSCAR 1 [si; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2022-49">Colossal OSCAR 1 [si; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2023-14">Colossal OSCAR 1 [si; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-si-2023-23">Colossal OSCAR 1 [si; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_si</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [si; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sk/index.html b/datasets/language_sk/index.html
new file mode 100644
index 0000000..29ab597
--- /dev/null
+++ b/datasets/language_sk/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sk/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Slovak Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#slovak-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Slovak Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-slovak-3rd-version" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Slovak 3rd version]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sk-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sk; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-sk" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [sk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#korpus-slovenskych-pravnych-predpisov-v19" class="md-nav__link">
+    <span class="md-ellipsis">
+      Korpus slovenských právnych predpisov v1.9
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-sk" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [sk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-sk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [sk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-sk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [sk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-sk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [sk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-sk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [sk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#od-justice-20" class="md-nav__link">
+    <span class="md-ellipsis">
+      od-justice 2.0
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="slovak-datasets">Slovak Datasets</h1>
+<p>In total, there are 21 datasets with 18 B tokens in the Slovak language.</p>
+<h2 id="curlicat-corpus-slovak-3rd-version">CURLICAT Corpus [Slovak 3rd version]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Slovak 3rd version]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
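+<td><a href="https://elrc-share.eu/repository/browse/curlicat-slovak-corpus-v10/b419d7086ef411ed9c1a00155d0267066a930aa487824c08ba48f1183e993aca/">https://elrc-share.eu/repository/browse/curlicat-slovak-corpus-v10/b419d7086ef411ed9c1a00155d0267066a930aa487824c08ba48f1183e993aca/</a></td>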
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>unknown (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>67 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2015-14">Colossal OSCAR 1 [sk; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2016-40">Colossal OSCAR 1 [sk; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2017-43">Colossal OSCAR 1 [sk; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2018-47">Colossal OSCAR 1 [sk; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2019-22">Colossal OSCAR 1 [sk; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2020-24">Colossal OSCAR 1 [sk; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2020-45">Colossal OSCAR 1 [sk; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2021-49">Colossal OSCAR 1 [sk; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2022-27">Colossal OSCAR 1 [sk; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2022-49">Colossal OSCAR 1 [sk; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2023-14">Colossal OSCAR 1 [sk; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sk-2023-23">Colossal OSCAR 1 [sk; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sk; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-sk">EurlexResources [sk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [sk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="korpus-slovenskych-pravnych-predpisov-v19">Korpus slovenských právnych predpisov v1.9</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sk_laws</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Korpus slovenských právnych predpisov v1.9</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Slovak body of laws (1955-2022)</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>Yes - it has a direct download link or links</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[None]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>45 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-sk">LegalMC4 [sk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [sk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>349 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-sk">Wikibooks [sk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [sk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-sk">Wikipedia [sk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [sk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>64 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-sk">Wikiquote [sk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [sk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-sk">Wikisource [sk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_sk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [sk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="od-justice-20">od-justice 2.0</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sk_court_decisions</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>od-justice 2.0</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Slovak court decisions. The corpus is based on data made available by the Ministry of Justice of the Slovak Republic.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://www.juls.savba.sk/justicecorp.html]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>open data (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>11 B</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sl/index.html b/datasets/language_sl/index.html
new file mode 100644
index 0000000..765b27b
--- /dev/null
+++ b/datasets/language_sl/index.html
@@ -0,0 +1,1633 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sl/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Slovene Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#slovene-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Slovene Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#curlicat-corpus-slovenian" class="md-nav__link">
+    <span class="md-ellipsis">
+      CURLICAT Corpus [Slovenian]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sl-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sl; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#corpus-of-academic-slovene-kas-20" class="md-nav__link">
+    <span class="md-ellipsis">
+      Corpus of academic Slovene KAS 2.0
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-sl" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [sl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-sl" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [sl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-slovene-20" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Slovene 2.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-sl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [sl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-sl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [sl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-sl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [sl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-sl" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [sl]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#written-corpus-ccgigafida-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      Written corpus ccGigafida 1.0
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#slwac-web-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      slWaC web corpus
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="slovene-datasets">Slovene Datasets</h1>
+<p>There are 23 datasets in total, with 9 B tokens, in the Slovene language.</p>
+<h2 id="curlicat-corpus-slovenian">CURLICAT Corpus [Slovenian]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>curlicat_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>CURLICAT Corpus [Slovenian]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://elrc-share.eu/repository/browse/curlicat-slovenian-corpus/e549f298590711ed9c1a00155d026706db0d61d46f294d9a821307cf9c5df245/]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC-BY-SA-4.0 (commercial use: None, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>43 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2015-14">Colossal OSCAR 1 [sl; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2016-40">Colossal OSCAR 1 [sl; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2017-43">Colossal OSCAR 1 [sl; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2018-47">Colossal OSCAR 1 [sl; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2019-22">Colossal OSCAR 1 [sl; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2020-24">Colossal OSCAR 1 [sl; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2020-45">Colossal OSCAR 1 [sl; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2021-49">Colossal OSCAR 1 [sl; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2022-27">Colossal OSCAR 1 [sl; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2022-49">Colossal OSCAR 1 [sl; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2023-14">Colossal OSCAR 1 [sl; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sl-2023-23">Colossal OSCAR 1 [sl; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sl; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>181 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="corpus-of-academic-slovene-kas-20">Corpus of academic Slovene KAS 2.0</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>academic_slovene_kas</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Corpus of academic Slovene KAS 2.0</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The KAS corpus of Slovene academic writing consists of almost 65,000 BSc/BA, 16,000 MSc/MA and 1,600 PhD theses (82 thousand texts, 5 million pages or 1.5 billion tokens) written between 2000 and 2018 and gathered from the digital libraries of Slovene higher education institutions via the Slovene Open Science portal (http://openscience.si/).</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1448">https://www.clarin.si/repository/xmlui/handle/11356/1448</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CLARIN.SI Licence ACA ID-BY-NC-INF-NORED 1.0 (commercial use: False, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-sl">EurlexResources [sl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [sl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/eurlex_resources">https://huggingface.co/datasets/joelito/eurlex_resources</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-sl">LegalMC4 [sl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [sl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/joelito/legal-mc4">https://huggingface.co/datasets/joelito/legal-mc4</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>107 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-slovene-20">MaCoCu web corpus [Slovene 2.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Slovene 2.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1795">https://www.clarin.si/repository/xmlui/handle/11356/1795</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-sl">Wikibooks [sl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [sl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-sl">Wikipedia [sl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [sl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>77 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-sl">Wikiquote [sl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [sl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>669 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-sl">Wikisource [sl]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_sl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [sl]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>118 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="written-corpus-ccgigafida-10">Written corpus ccGigafida 1.0</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>cc_gigafida</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Written corpus ccGigafida 1.0</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Corpus ccGigafida consists of paragraph samples from 31,722 documents, each containing information about the source (e.g. newspapers, magazines), year of publication, text type (fiction, newspaper), the title and author if they are known.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1035">https://www.clarin.si/repository/xmlui/handle/11356/1035</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) (commercial use: False, sharealike: True)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>127 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="slwac-web-corpus">slWaC web corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>slwac_web</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>slWaC web corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>slWaC is a web corpus collected from the .si top-level domain in 2011 and 2014. The corpus is tokenized and annotated with the lemma and the morphosyntax layer.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>None</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="http://nlp.ffzg.hr/resources/corpora/slwac/">http://nlp.ffzg.hr/resources/corpora/slwac/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>open license</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sn/index.html b/datasets/language_sn/index.html
new file mode 100644
index 0000000..cc2b523
--- /dev/null
+++ b/datasets/language_sn/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sn/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Shona Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#shona-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Shona Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-shona" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Shona]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="shona-datasets">Shona Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Shona language.</p>
+<h2 id="wura-shona">WURA [Shona]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_sn</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Shona]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+
+
+
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_so/index.html b/datasets/language_so/index.html
new file mode 100644
index 0000000..cc1fd71
--- /dev/null
+++ b/datasets/language_so/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_so/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Somali Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#somali-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Somali Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-so-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [so; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-somali" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Somali]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="somali-datasets">Somali Datasets</h1>
+<p>There are 13 datasets in total with N/A tokens in the Somali language.</p>
+<h2 id="colossal-oscar-1-so-2015-14">Colossal OSCAR 1 [so; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2016-40">Colossal OSCAR 1 [so; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2017-43">Colossal OSCAR 1 [so; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2018-47">Colossal OSCAR 1 [so; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2019-22">Colossal OSCAR 1 [so; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2020-24">Colossal OSCAR 1 [so; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2020-45">Colossal OSCAR 1 [so; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2021-49">Colossal OSCAR 1 [so; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2022-27">Colossal OSCAR 1 [so; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2022-49">Colossal OSCAR 1 [so; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2023-14">Colossal OSCAR 1 [so; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-so-2023-23">Colossal OSCAR 1 [so; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [so; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-somali">WURA [Somali]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_so</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Somali]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+</tr>
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sq/index.html b/datasets/language_sq/index.html
new file mode 100644
index 0000000..1accdc0
--- /dev/null
+++ b/datasets/language_sq/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sq/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Albanian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#albanian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Albanian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sq-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sq; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="albanian-datasets">Albanian Datasets</h1>
+<p>In total, there are 12 datasets with N/A tokens in the Albanian language.</p>
+<h2 id="colossal-oscar-1-sq-2015-14">Colossal OSCAR 1 [sq; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2016-40">Colossal OSCAR 1 [sq; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2017-43">Colossal OSCAR 1 [sq; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2018-47">Colossal OSCAR 1 [sq; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2019-22">Colossal OSCAR 1 [sq; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2020-24">Colossal OSCAR 1 [sq; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2020-45">Colossal OSCAR 1 [sq; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2021-49">Colossal OSCAR 1 [sq; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2022-27">Colossal OSCAR 1 [sq; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2022-49">Colossal OSCAR 1 [sq; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2023-14">Colossal OSCAR 1 [sq; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sq-2023-23">Colossal OSCAR 1 [sq; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sq</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sq; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sr/index.html b/datasets/language_sr/index.html
new file mode 100644
index 0000000..65cd3f3
--- /dev/null
+++ b/datasets/language_sr/index.html
@@ -0,0 +1,1457 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Serbian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#serbian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Serbian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-serbian-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Serbian 1.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#srpkorsubset-news-legal-academic-conversation-literary" class="md-nav__link">
+    <span class="md-ellipsis">
+      SrpKorSubset (news, legal, academic, conversation, literary)
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-sr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [sr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-sr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [sr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-sr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [sr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-sr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [sr]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-sr" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [sr]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="serbian-datasets">Serbian Datasets</h1>
+<p>In total, there are 19 Serbian-language datasets with 3 B tokens.</p>
+<h2 id="colossal-oscar-1-sr-2015-14">Colossal OSCAR 1 [sr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2016-40">Colossal OSCAR 1 [sr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2017-43">Colossal OSCAR 1 [sr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2018-47">Colossal OSCAR 1 [sr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2019-22">Colossal OSCAR 1 [sr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2020-24">Colossal OSCAR 1 [sr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2020-45">Colossal OSCAR 1 [sr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2021-49">Colossal OSCAR 1 [sr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2022-27">Colossal OSCAR 1 [sr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2022-49">Colossal OSCAR 1 [sr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2023-14">Colossal OSCAR 1 [sr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sr-2023-23">Colossal OSCAR 1 [sr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>652 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-serbian-10">MaCoCu web corpus [Serbian 1.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Serbian 1.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1807">https://www.clarin.si/repository/xmlui/handle/11356/1807</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>2 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="srpkorsubset-news-legal-academic-conversation-literary">SrpKorSubset (news, legal, academic, conversation, literary)</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>srpkor</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>SrpKorSubset (news, legal, academic, conversation, literary)</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The Corpus of contemporary Serbian, SrpKor, consists of 4,925 texts.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>on_request</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="http://www.korpus.matf.bg.ac.rs/">http://www.korpus.matf.bg.ac.rs/</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Do not redistribute, DFKI has permission to use it for pre-training LLMs (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-sr">Wikibooks [sr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [sr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-sr">Wikinews [sr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [sr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-sr">Wikipedia [sr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [sr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-sr">Wikiquote [sr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [sr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-sr">Wikisource [sr]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_sr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [sr]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_st/index.html b/datasets/language_st/index.html
new file mode 100644
index 0000000..3ca389e
--- /dev/null
+++ b/datasets/language_st/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_st/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Southern Sotho Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#southern-sotho-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Southern Sotho Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-southern-sotho" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Southern Sotho]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="southern-sotho-datasets">Southern Sotho Datasets</h1>
+<p>There is 1 dataset in total, with N/A tokens, in the Southern Sotho language.</p>
+<h2 id="wura-southern-sotho">WURA [Southern Sotho]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_st</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Southern Sotho]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly spoken in Africa.</td>
+
+
+
+
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_su/index.html b/datasets/language_su/index.html
new file mode 100644
index 0000000..80c75f7
--- /dev/null
+++ b/datasets/language_su/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_su/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Sundanese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#sundanese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Sundanese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-su-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [su; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="sundanese-datasets">Sundanese Datasets</h1>
+<p>There are 12 datasets in total for the Sundanese language (token counts: N/A).</p>
+<h2 id="colossal-oscar-1-su-2015-14">Colossal OSCAR 1 [su; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2016-40">Colossal OSCAR 1 [su; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2017-43">Colossal OSCAR 1 [su; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2018-47">Colossal OSCAR 1 [su; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2019-22">Colossal OSCAR 1 [su; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2020-24">Colossal OSCAR 1 [su; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2020-45">Colossal OSCAR 1 [su; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2021-49">Colossal OSCAR 1 [su; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2022-27">Colossal OSCAR 1 [su; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2022-49">Colossal OSCAR 1 [su; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2023-14">Colossal OSCAR 1 [su; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-su-2023-23">Colossal OSCAR 1 [su; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_su</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [su; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sv/index.html b/datasets/language_sv/index.html
new file mode 100644
index 0000000..4144524
--- /dev/null
+++ b/datasets/language_sv/index.html
@@ -0,0 +1,1545 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sv/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Swedish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#swedish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Swedish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sv-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sv; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#eurlexresources-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      EurlexResources [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#legalmc4-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      LegalMC4 [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#the-swedish-culturomics-gigaword-corpus" class="md-nav__link">
+    <span class="md-ellipsis">
+      The Swedish Culturomics Gigaword Corpus
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [sv]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-sv" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [sv]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="swedish-datasets">Swedish Datasets</h1>
+<p>There are 21 datasets in total, with 13 B tokens, in the Swedish language.</p>
+<h2 id="colossal-oscar-1-sv-2015-14">Colossal OSCAR 1 [sv; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2016-40">Colossal OSCAR 1 [sv; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2017-43">Colossal OSCAR 1 [sv; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2018-47">Colossal OSCAR 1 [sv; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2019-22">Colossal OSCAR 1 [sv; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2020-24">Colossal OSCAR 1 [sv; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2020-45">Colossal OSCAR 1 [sv; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2021-49">Colossal OSCAR 1 [sv; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2022-27">Colossal OSCAR 1 [sv; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2022-49">Colossal OSCAR 1 [sv; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2023-14">Colossal OSCAR 1 [sv; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sv-2023-23">Colossal OSCAR 1 [sv; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sv; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="eurlexresources-sv">EurlexResources [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>eurlex_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>EurlexResources [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A Corpus Covering the Largest EURLEX Resources.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/eurlex_resources]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>5 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="legalmc4-sv">LegalMC4 [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>legal_mc4_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>LegalMC4 [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://huggingface.co/datasets/joelito/legal-mc4]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>328 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="the-swedish-culturomics-gigaword-corpus">The Swedish Culturomics Gigaword Corpus</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>sv_gigaword</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>The Swedish Culturomics Gigaword Corpus</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>One billion Swedish words from 1950 and onwards. Code to extract data from the corpus, as well as usage instructions, can be downloaded from https://svn.spraakdata.gu.se/sb-arkiv/tools/gigaword/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://spraakbanken.gu.se/en/resources/gigaword]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>BY-SA 4.0 (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-sv">Wikibooks [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikibooks.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>3 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-sv">Wikinews [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikinews.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>540 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-sv">Wikipedia [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikipedia.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>130 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-sv">Wikiquote [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikiquote.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>381 k</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-sv">Wikisource [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikisource.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>9 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-sv">Wikivoyage [sv]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_sv</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [sv]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td>[https://en.wikivoyage.org/wiki/Main_Page]</td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>1 M</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_sw/index.html b/datasets/language_sw/index.html
new file mode 100644
index 0000000..cdb267d
--- /dev/null
+++ b/datasets/language_sw/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_sw/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Swahili Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#swahili-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Swahili Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-sw-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [sw; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-swahili" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Swahili]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="swahili-datasets">Swahili Datasets</h1>
+<p>There are 13 datasets in total for the Swahili language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-sw-2015-14">Colossal OSCAR 1 [sw; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2016-40">Colossal OSCAR 1 [sw; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2017-43">Colossal OSCAR 1 [sw; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2018-47">Colossal OSCAR 1 [sw; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2019-22">Colossal OSCAR 1 [sw; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2020-24">Colossal OSCAR 1 [sw; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2020-45">Colossal OSCAR 1 [sw; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2021-49">Colossal OSCAR 1 [sw; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2022-27">Colossal OSCAR 1 [sw; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2022-49">Colossal OSCAR 1 [sw; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2023-14">Colossal OSCAR 1 [sw; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-sw-2023-23">Colossal OSCAR 1 [sw; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [sw; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-swahili">WURA [Swahili]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_sw</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Swahili]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>
+Wura is large-scale pretraining data
+for 20 languages popularly
+spoken in Africa.
+</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ta/index.html b/datasets/language_ta/index.html
new file mode 100644
index 0000000..ce33cd9
--- /dev/null
+++ b/datasets/language_ta/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ta/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Tamil Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#tamil-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Tamil Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ta-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ta; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="tamil-datasets">Tamil Datasets</h1>
+<p>There are 12 Tamil-language datasets in total (token count: N/A).</p>
+<h2 id="colossal-oscar-1-ta-2015-14">Colossal OSCAR 1 [ta; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2016-40">Colossal OSCAR 1 [ta; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2017-43">Colossal OSCAR 1 [ta; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2018-47">Colossal OSCAR 1 [ta; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2019-22">Colossal OSCAR 1 [ta; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2020-24">Colossal OSCAR 1 [ta; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2020-45">Colossal OSCAR 1 [ta; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2021-49">Colossal OSCAR 1 [ta; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2022-27">Colossal OSCAR 1 [ta; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2022-49">Colossal OSCAR 1 [ta; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2023-14">Colossal OSCAR 1 [ta; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ta-2023-23">Colossal OSCAR 1 [ta; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ta</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ta; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_te/index.html b/datasets/language_te/index.html
new file mode 100644
index 0000000..3177d68
--- /dev/null
+++ b/datasets/language_te/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_te/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Telugu Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#telugu-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Telugu Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-te-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [te; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="telugu-datasets">Telugu Datasets</h1>
+<p>There are 12 datasets in total for the Telugu language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-te-2015-14">Colossal OSCAR 1 [te; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2016-40">Colossal OSCAR 1 [te; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2017-43">Colossal OSCAR 1 [te; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2018-47">Colossal OSCAR 1 [te; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2019-22">Colossal OSCAR 1 [te; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2020-24">Colossal OSCAR 1 [te; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2020-45">Colossal OSCAR 1 [te; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2021-49">Colossal OSCAR 1 [te; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2022-27">Colossal OSCAR 1 [te; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2022-49">Colossal OSCAR 1 [te; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2023-14">Colossal OSCAR 1 [te; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-te-2023-23">Colossal OSCAR 1 [te; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_te</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [te; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_tg/index.html b/datasets/language_tg/index.html
new file mode 100644
index 0000000..a60fb8d
--- /dev/null
+++ b/datasets/language_tg/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_tg/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Tajik Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#tajik-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Tajik Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tg-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tg; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="tajik-datasets">Tajik Datasets</h1>
+<p>There are 12 datasets in total for the Tajik language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-tg-2015-14">Colossal OSCAR 1 [tg; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2016-40">Colossal OSCAR 1 [tg; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2017-43">Colossal OSCAR 1 [tg; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2018-47">Colossal OSCAR 1 [tg; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2019-22">Colossal OSCAR 1 [tg; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2020-24">Colossal OSCAR 1 [tg; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2020-45">Colossal OSCAR 1 [tg; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2021-49">Colossal OSCAR 1 [tg; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2022-27">Colossal OSCAR 1 [tg; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2022-49">Colossal OSCAR 1 [tg; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2023-14">Colossal OSCAR 1 [tg; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tg-2023-23">Colossal OSCAR 1 [tg; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_tg</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tg; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_th/index.html b/datasets/language_th/index.html
new file mode 100644
index 0000000..00de40a
--- /dev/null
+++ b/datasets/language_th/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_th/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Thai Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#thai-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Thai Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-th-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [th; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="thai-datasets">Thai Datasets</h1>
+<p>There are 12 datasets in total for the Thai language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-th-2015-14">Colossal OSCAR 1 [th; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2016-40">Colossal OSCAR 1 [th; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2017-43">Colossal OSCAR 1 [th; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2018-47">Colossal OSCAR 1 [th; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2019-22">Colossal OSCAR 1 [th; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2020-24">Colossal OSCAR 1 [th; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2020-45">Colossal OSCAR 1 [th; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2021-49">Colossal OSCAR 1 [th; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2022-27">Colossal OSCAR 1 [th; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2022-49">Colossal OSCAR 1 [th; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2023-14">Colossal OSCAR 1 [th; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-th-2023-23">Colossal OSCAR 1 [th; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_th</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [th; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ti/index.html b/datasets/language_ti/index.html
new file mode 100644
index 0000000..925ef57
--- /dev/null
+++ b/datasets/language_ti/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ti/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Tigrinya Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#tigrinya-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Tigrinya Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-tigrinya" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Tigrinya]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="tigrinya-datasets">Tigrinya Datasets</h1>
+<p>There is 1 dataset in total, with N/A tokens, in the Tigrinya language.</p>
+<h2 id="wura-tigrinya">WURA [Tigrinya]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_ti</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Tigrinya]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly</td>
+</tr>
+<tr>
+<td>spoken in Africa.</td>
+<td></td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_tk/index.html b/datasets/language_tk/index.html
new file mode 100644
index 0000000..c07fc9c
--- /dev/null
+++ b/datasets/language_tk/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_tk/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Turkmen Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#turkmen-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Turkmen Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tk-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tk; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="turkmen-datasets">Turkmen Datasets</h1>
+<p>There are 12 datasets in total for the Turkmen language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-tk-2015-14">Colossal OSCAR 1 [tk; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2016-40">Colossal OSCAR 1 [tk; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2017-43">Colossal OSCAR 1 [tk; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2018-47">Colossal OSCAR 1 [tk; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2019-22">Colossal OSCAR 1 [tk; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2020-24">Colossal OSCAR 1 [tk; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2020-45">Colossal OSCAR 1 [tk; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2021-49">Colossal OSCAR 1 [tk; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2022-27">Colossal OSCAR 1 [tk; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2022-49">Colossal OSCAR 1 [tk; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2023-14">Colossal OSCAR 1 [tk; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tk-2023-23">Colossal OSCAR 1 [tk; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_tk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tk; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_tl/index.html b/datasets/language_tl/index.html
new file mode 100644
index 0000000..724df28
--- /dev/null
+++ b/datasets/language_tl/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_tl/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Tagalog Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#tagalog-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Tagalog Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tl-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tl; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="tagalog-datasets">Tagalog Datasets</h1>
+<p>There are 12 datasets in total for the Tagalog language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-tl-2015-14">Colossal OSCAR 1 [tl; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2016-40">Colossal OSCAR 1 [tl; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2017-43">Colossal OSCAR 1 [tl; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2018-47">Colossal OSCAR 1 [tl; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2019-22">Colossal OSCAR 1 [tl; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2020-24">Colossal OSCAR 1 [tl; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2020-45">Colossal OSCAR 1 [tl; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2021-49">Colossal OSCAR 1 [tl; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2022-27">Colossal OSCAR 1 [tl; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2022-49">Colossal OSCAR 1 [tl; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2023-14">Colossal OSCAR 1 [tl; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tl-2023-23">Colossal OSCAR 1 [tl; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_tl</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tl; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_tr/index.html b/datasets/language_tr/index.html
new file mode 100644
index 0000000..5284924
--- /dev/null
+++ b/datasets/language_tr/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_tr/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Turkish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#turkish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Turkish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tr-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tr; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="turkish-datasets">Turkish Datasets</h1>
+<p>There are 12 Turkish-language datasets in total; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-tr-2015-14">Colossal OSCAR 1 [tr; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2016-40">Colossal OSCAR 1 [tr; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2017-43">Colossal OSCAR 1 [tr; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2018-47">Colossal OSCAR 1 [tr; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2019-22">Colossal OSCAR 1 [tr; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2020-24">Colossal OSCAR 1 [tr; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2020-45">Colossal OSCAR 1 [tr; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2021-49">Colossal OSCAR 1 [tr; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2022-27">Colossal OSCAR 1 [tr; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2022-49">Colossal OSCAR 1 [tr; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2023-14">Colossal OSCAR 1 [tr; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tr-2023-23">Colossal OSCAR 1 [tr; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_tr</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tr; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_tt/index.html b/datasets/language_tt/index.html
new file mode 100644
index 0000000..c6c304e
--- /dev/null
+++ b/datasets/language_tt/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_tt/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Tatar Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#tatar-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Tatar Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-tt-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [tt; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="tatar-datasets">Tatar Datasets</h1>
+<p>There are 12 datasets in total for the Tatar language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-tt-2015-14">Colossal OSCAR 1 [tt; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2016-40">Colossal OSCAR 1 [tt; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2017-43">Colossal OSCAR 1 [tt; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2018-47">Colossal OSCAR 1 [tt; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2019-22">Colossal OSCAR 1 [tt; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2020-24">Colossal OSCAR 1 [tt; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2020-45">Colossal OSCAR 1 [tt; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2021-49">Colossal OSCAR 1 [tt; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2022-27">Colossal OSCAR 1 [tt; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2022-49">Colossal OSCAR 1 [tt; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2023-14">Colossal OSCAR 1 [tt; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-tt-2023-23">Colossal OSCAR 1 [tt; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_tt</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [tt; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ug/index.html b/datasets/language_ug/index.html
new file mode 100644
index 0000000..70bb83e
--- /dev/null
+++ b/datasets/language_ug/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ug/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Uighur Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#uighur-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Uighur Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ug-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ug; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="uighur-datasets">Uighur Datasets</h1>
+<p>There are 12 datasets in total for the Uighur language; the total token count is N/A.</p>
+<h2 id="colossal-oscar-1-ug-2015-14">Colossal OSCAR 1 [ug; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2016-40">Colossal OSCAR 1 [ug; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2017-43">Colossal OSCAR 1 [ug; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2018-47">Colossal OSCAR 1 [ug; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2019-22">Colossal OSCAR 1 [ug; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2020-24">Colossal OSCAR 1 [ug; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2020-45">Colossal OSCAR 1 [ug; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2021-49">Colossal OSCAR 1 [ug; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2022-27">Colossal OSCAR 1 [ug; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2022-49">Colossal OSCAR 1 [ug; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2023-14">Colossal OSCAR 1 [ug; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ug-2023-23">Colossal OSCAR 1 [ug; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ug</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ug; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_uk/index.html b/datasets/language_uk/index.html
new file mode 100644
index 0000000..b491346
--- /dev/null
+++ b/datasets/language_uk/index.html
@@ -0,0 +1,1501 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_uk/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Ukrainian Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#ukrainian-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Ukrainian Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uk-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uk; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#corpus-of-laws-and-legal-acts-of-ukraine" class="md-nav__link">
+    <span class="md-ellipsis">
+      Corpus of laws and legal acts of Ukraine
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#macocu-web-corpus-ukrainian-10" class="md-nav__link">
+    <span class="md-ellipsis">
+      MaCoCu web corpus [Ukrainian 1.0]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikibooks-uk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikibooks [uk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikinews-uk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikinews [uk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikipedia-uk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikipedia [uk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikiquote-uk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikiquote [uk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikisource-uk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikisource [uk]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wikivoyage-uk" class="md-nav__link">
+    <span class="md-ellipsis">
+      Wikivoyage [uk]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="ukrainian-datasets">Ukrainian Datasets</h1>
+<p>In total, there are 20 datasets with 11 B tokens in the Ukrainian language.</p>
+<h2 id="colossal-oscar-1-uk-2015-14">Colossal OSCAR 1 [uk; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2016-40">Colossal OSCAR 1 [uk; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2017-43">Colossal OSCAR 1 [uk; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2018-47">Colossal OSCAR 1 [uk; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2019-22">Colossal OSCAR 1 [uk; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2020-24">Colossal OSCAR 1 [uk; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2020-45">Colossal OSCAR 1 [uk; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2021-49">Colossal OSCAR 1 [uk; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2022-27">Colossal OSCAR 1 [uk; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2022-49">Colossal OSCAR 1 [uk; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2023-14">Colossal OSCAR 1 [uk; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uk-2023-23">Colossal OSCAR 1 [uk; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uk; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>4 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="corpus-of-laws-and-legal-acts-of-ukraine">Corpus of laws and legal acts of Ukraine</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>uk_laws</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Corpus of laws and legal acts of Ukraine</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>A large (more than 9 Gb) corpus of laws and legal acts of Ukraine.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://lang.org.ua/en/corpora/#anchor7">https://lang.org.ua/en/corpora/#anchor7</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Unknown, likely public domain (commercial use: None, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>579 M</td>
+</tr>
+</tbody>
+</table>
+<h2 id="macocu-web-corpus-ukrainian-10">MaCoCu web corpus [Ukrainian 1.0]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>macocu_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>MaCoCu web corpus [Ukrainian 1.0]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>MaCoCu focuses on collecting monolingual and parallel data from the Internet, especially for under-resourced languages and DSI-specific data. See https://macocu.eu/</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://www.clarin.si/repository/xmlui/handle/11356/1838">https://www.clarin.si/repository/xmlui/handle/11356/1838</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CC0-No Rights Reserved (commercial use: True, sharealike: False)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>6 B</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikibooks-uk">Wikibooks [uk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikibooks_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikibooks [uk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The open-content textbooks collection that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikibooks.org/wiki/Main_Page">https://en.wikibooks.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikinews-uk">Wikinews [uk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikinews_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikinews [uk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>News written by volunteers.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikinews.org/wiki/Main_Page">https://en.wikinews.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikipedia-uk">Wikipedia [uk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wiki_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikipedia [uk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free encyclopedia that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikipedia.org/wiki/Main_Page">https://en.wikipedia.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikiquote-uk">Wikiquote [uk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikiquote_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikiquote [uk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free quote compendium that anyone can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikiquote.org/wiki/Main_Page">https://en.wikiquote.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikisource-uk">Wikisource [uk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikisource_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikisource [uk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free library that anyone can improve.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikisource.org/wiki/Main_Page">https://en.wikisource.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wikivoyage-uk">Wikivoyage [uk]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wikivoyage_uk</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Wikivoyage [uk]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The free worldwide travel guide that you can edit.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://en.wikivoyage.org/wiki/Main_Page">https://en.wikivoyage.org/wiki/Main_Page</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_ur/index.html b/datasets/language_ur/index.html
new file mode 100644
index 0000000..f9121a0
--- /dev/null
+++ b/datasets/language_ur/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_ur/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Urdu Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#urdu-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Urdu Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-ur-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [ur; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="urdu-datasets">Urdu Datasets</h1>
+<p>There are 12 datasets in total for the Urdu language; their combined token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-ur-2015-14">Colossal OSCAR 1 [ur; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2016-40">Colossal OSCAR 1 [ur; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2017-43">Colossal OSCAR 1 [ur; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2018-47">Colossal OSCAR 1 [ur; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2019-22">Colossal OSCAR 1 [ur; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2020-24">Colossal OSCAR 1 [ur; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2020-45">Colossal OSCAR 1 [ur; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2021-49">Colossal OSCAR 1 [ur; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2022-27">Colossal OSCAR 1 [ur; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2022-49">Colossal OSCAR 1 [ur; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2023-14">Colossal OSCAR 1 [ur; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-ur-2023-23">Colossal OSCAR 1 [ur; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_ur</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [ur; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_uz/index.html b/datasets/language_uz/index.html
new file mode 100644
index 0000000..5fdb78d
--- /dev/null
+++ b/datasets/language_uz/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_uz/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Uzbek Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#uzbek-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Uzbek Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-uz-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [uz; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="uzbek-datasets">Uzbek Datasets</h1>
+<p>There are 12 datasets in total for the Uzbek language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-uz-2015-14">Colossal OSCAR 1 [uz; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2016-40">Colossal OSCAR 1 [uz; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2017-43">Colossal OSCAR 1 [uz; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2018-47">Colossal OSCAR 1 [uz; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2019-22">Colossal OSCAR 1 [uz; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2020-24">Colossal OSCAR 1 [uz; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2020-45">Colossal OSCAR 1 [uz; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2021-49">Colossal OSCAR 1 [uz; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2022-27">Colossal OSCAR 1 [uz; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2022-49">Colossal OSCAR 1 [uz; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2023-14">Colossal OSCAR 1 [uz; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-uz-2023-23">Colossal OSCAR 1 [uz; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_uz</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [uz; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_vi/index.html b/datasets/language_vi/index.html
new file mode 100644
index 0000000..a936106
--- /dev/null
+++ b/datasets/language_vi/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_vi/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Vietnamese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#vietnamese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Vietnamese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vi-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vi; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="vietnamese-datasets">Vietnamese Datasets</h1>
+<p>There are 12 datasets in total for the Vietnamese language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-vi-2015-14">Colossal OSCAR 1 [vi; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2016-40">Colossal OSCAR 1 [vi; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2017-43">Colossal OSCAR 1 [vi; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2018-47">Colossal OSCAR 1 [vi; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2019-22">Colossal OSCAR 1 [vi; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2020-24">Colossal OSCAR 1 [vi; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2020-45">Colossal OSCAR 1 [vi; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2021-49">Colossal OSCAR 1 [vi; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2022-27">Colossal OSCAR 1 [vi; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2022-49">Colossal OSCAR 1 [vi; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2023-14">Colossal OSCAR 1 [vi; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vi-2023-23">Colossal OSCAR 1 [vi; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_vi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vi; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_vo/index.html b/datasets/language_vo/index.html
new file mode 100644
index 0000000..1fa850d
--- /dev/null
+++ b/datasets/language_vo/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_vo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Volapük Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#volapuk-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Volapük Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-vo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [vo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="volapuk-datasets">Volapük Datasets</h1>
+<p>There are 12 datasets in total for the Volapük language; their token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-vo-2015-14">Colossal OSCAR 1 [vo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2016-40">Colossal OSCAR 1 [vo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2017-43">Colossal OSCAR 1 [vo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2018-47">Colossal OSCAR 1 [vo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2019-22">Colossal OSCAR 1 [vo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2020-24">Colossal OSCAR 1 [vo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2020-45">Colossal OSCAR 1 [vo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2021-49">Colossal OSCAR 1 [vo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2022-27">Colossal OSCAR 1 [vo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2022-49">Colossal OSCAR 1 [vo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2023-14">Colossal OSCAR 1 [vo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-vo-2023-23">Colossal OSCAR 1 [vo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_vo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [vo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_wa/index.html b/datasets/language_wa/index.html
new file mode 100644
index 0000000..4c4f058
--- /dev/null
+++ b/datasets/language_wa/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_wa/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Walloon Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#walloon-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Walloon Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wa-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wa; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="walloon-datasets">Walloon Datasets</h1>
+<p>There are 12 datasets in total for the Walloon language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-wa-2015-14">Colossal OSCAR 1 [wa; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2016-40">Colossal OSCAR 1 [wa; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2017-43">Colossal OSCAR 1 [wa; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2018-47">Colossal OSCAR 1 [wa; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2019-22">Colossal OSCAR 1 [wa; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2020-24">Colossal OSCAR 1 [wa; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2020-45">Colossal OSCAR 1 [wa; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2021-49">Colossal OSCAR 1 [wa; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2022-27">Colossal OSCAR 1 [wa; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2022-49">Colossal OSCAR 1 [wa; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2023-14">Colossal OSCAR 1 [wa; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wa-2023-23">Colossal OSCAR 1 [wa; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_wa</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wa; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_war/index.html b/datasets/language_war/index.html
new file mode 100644
index 0000000..79ecbd3
--- /dev/null
+++ b/datasets/language_war/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_war/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>War Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#war-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              War Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-war-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [war; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="war-datasets">War Datasets</h1>
+<p>There are 12 datasets in total for the War language (total token count: N/A).</p>
+<h2 id="colossal-oscar-1-war-2015-14">Colossal OSCAR 1 [war; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2016-40">Colossal OSCAR 1 [war; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2017-43">Colossal OSCAR 1 [war; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2018-47">Colossal OSCAR 1 [war; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2019-22">Colossal OSCAR 1 [war; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2020-24">Colossal OSCAR 1 [war; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2020-45">Colossal OSCAR 1 [war; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2021-49">Colossal OSCAR 1 [war; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2022-27">Colossal OSCAR 1 [war; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2022-49">Colossal OSCAR 1 [war; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2023-14">Colossal OSCAR 1 [war; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-war-2023-23">Colossal OSCAR 1 [war; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_war</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [war; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_wuu/index.html b/datasets/language_wuu/index.html
new file mode 100644
index 0000000..5449095
--- /dev/null
+++ b/datasets/language_wuu/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_wuu/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Wuu Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#wuu-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Wuu Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-wuu-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [wuu; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="wuu-datasets">Wuu Datasets</h1>
+<p>There are 12 datasets in total in the Wuu language; the total token count is not available (N/A).</p>
+<h2 id="colossal-oscar-1-wuu-2015-14">Colossal OSCAR 1 [wuu; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2016-40">Colossal OSCAR 1 [wuu; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2017-43">Colossal OSCAR 1 [wuu; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2018-47">Colossal OSCAR 1 [wuu; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2019-22">Colossal OSCAR 1 [wuu; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2020-24">Colossal OSCAR 1 [wuu; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2020-45">Colossal OSCAR 1 [wuu; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2021-49">Colossal OSCAR 1 [wuu; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2022-27">Colossal OSCAR 1 [wuu; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2022-49">Colossal OSCAR 1 [wuu; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2023-14">Colossal OSCAR 1 [wuu; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-wuu-2023-23">Colossal OSCAR 1 [wuu; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_wuu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [wuu; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_x-eml/index.html b/datasets/language_x-eml/index.html
new file mode 100644
index 0000000..b93d893
--- /dev/null
+++ b/datasets/language_x-eml/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_x-eml/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>X-Eml Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#x-eml-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              X-Eml Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-x-eml-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [x-eml; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="x-eml-datasets">X-Eml Datasets</h1>
+<p>There are 12 datasets in total for the X-Eml language (token count: N/A).</p>
+<h2 id="colossal-oscar-1-x-eml-2015-14">Colossal OSCAR 1 [x-eml; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2016-40">Colossal OSCAR 1 [x-eml; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2017-43">Colossal OSCAR 1 [x-eml; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2018-47">Colossal OSCAR 1 [x-eml; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2019-22">Colossal OSCAR 1 [x-eml; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2020-24">Colossal OSCAR 1 [x-eml; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2020-45">Colossal OSCAR 1 [x-eml; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2021-49">Colossal OSCAR 1 [x-eml; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2022-27">Colossal OSCAR 1 [x-eml; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2022-49">Colossal OSCAR 1 [x-eml; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2023-14">Colossal OSCAR 1 [x-eml; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-x-eml-2023-23">Colossal OSCAR 1 [x-eml; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_x-eml</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [x-eml; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_xal/index.html b/datasets/language_xal/index.html
new file mode 100644
index 0000000..fb53494
--- /dev/null
+++ b/datasets/language_xal/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_xal/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Xal Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#xal-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Xal Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xal-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xal; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="xal-datasets">Xal Datasets</h1>
+<p>There are 12 datasets in total, with N/A tokens, in the Xal language.</p>
+<h2 id="colossal-oscar-1-xal-2015-14">Colossal OSCAR 1 [xal; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2016-40">Colossal OSCAR 1 [xal; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2017-43">Colossal OSCAR 1 [xal; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2018-47">Colossal OSCAR 1 [xal; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2019-22">Colossal OSCAR 1 [xal; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2020-24">Colossal OSCAR 1 [xal; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2020-45">Colossal OSCAR 1 [xal; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2021-49">Colossal OSCAR 1 [xal; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2022-27">Colossal OSCAR 1 [xal; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2022-49">Colossal OSCAR 1 [xal; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2023-14">Colossal OSCAR 1 [xal; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xal-2023-23">Colossal OSCAR 1 [xal; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_xal</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xal; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically on providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_xh/index.html b/datasets/language_xh/index.html
new file mode 100644
index 0000000..286014a
--- /dev/null
+++ b/datasets/language_xh/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_xh/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Xhosa Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#xhosa-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Xhosa Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-xhosa" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Xhosa]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="xhosa-datasets">Xhosa Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Xhosa language.</p>
+<h2 id="wura-xhosa">WURA [Xhosa]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_xh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Xhosa]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+
+
+
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_xmf/index.html b/datasets/language_xmf/index.html
new file mode 100644
index 0000000..462a950
--- /dev/null
+++ b/datasets/language_xmf/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_xmf/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Xmf Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#xmf-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Xmf Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-xmf-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [xmf; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="xmf-datasets">Xmf Datasets</h1>
+<p>There are 12 datasets in total for the Xmf language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-xmf-2015-14">Colossal OSCAR 1 [xmf; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2016-40">Colossal OSCAR 1 [xmf; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2017-43">Colossal OSCAR 1 [xmf; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2018-47">Colossal OSCAR 1 [xmf; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2019-22">Colossal OSCAR 1 [xmf; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2020-24">Colossal OSCAR 1 [xmf; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2020-45">Colossal OSCAR 1 [xmf; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2021-49">Colossal OSCAR 1 [xmf; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2022-27">Colossal OSCAR 1 [xmf; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2022-49">Colossal OSCAR 1 [xmf; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2023-14">Colossal OSCAR 1 [xmf; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-xmf-2023-23">Colossal OSCAR 1 [xmf; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_xmf</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [xmf; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_yi/index.html b/datasets/language_yi/index.html
new file mode 100644
index 0000000..cff7764
--- /dev/null
+++ b/datasets/language_yi/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_yi/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Yiddish Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#yiddish-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Yiddish Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yi-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yi; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="yiddish-datasets">Yiddish Datasets</h1>
+<p>There are 12 datasets in total for the Yiddish language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-yi-2015-14">Colossal OSCAR 1 [yi; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2016-40">Colossal OSCAR 1 [yi; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2017-43">Colossal OSCAR 1 [yi; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2018-47">Colossal OSCAR 1 [yi; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2019-22">Colossal OSCAR 1 [yi; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2020-24">Colossal OSCAR 1 [yi; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2020-45">Colossal OSCAR 1 [yi; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2021-49">Colossal OSCAR 1 [yi; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2022-27">Colossal OSCAR 1 [yi; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2022-49">Colossal OSCAR 1 [yi; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2023-14">Colossal OSCAR 1 [yi; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yi-2023-23">Colossal OSCAR 1 [yi; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_yi</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yi; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_yo/index.html b/datasets/language_yo/index.html
new file mode 100644
index 0000000..1963269
--- /dev/null
+++ b/datasets/language_yo/index.html
@@ -0,0 +1,1197 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_yo/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Yoruba Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#yoruba-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Yoruba Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-yo-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [yo; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#wura-yoruba" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Yoruba]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="yoruba-datasets">Yoruba Datasets</h1>
+<p>There are 13 datasets in total for the Yoruba language; token counts are not available (N/A).</p>
+<h2 id="colossal-oscar-1-yo-2015-14">Colossal OSCAR 1 [yo; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2016-40">Colossal OSCAR 1 [yo; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2017-43">Colossal OSCAR 1 [yo; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2018-47">Colossal OSCAR 1 [yo; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2019-22">Colossal OSCAR 1 [yo; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2020-24">Colossal OSCAR 1 [yo; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2020-45">Colossal OSCAR 1 [yo; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2021-49">Colossal OSCAR 1 [yo; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2022-27">Colossal OSCAR 1 [yo; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2022-49">Colossal OSCAR 1 [yo; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2023-14">Colossal OSCAR 1 [yo; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-yo-2023-23">Colossal OSCAR 1 [yo; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [yo; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="wura-yoruba">WURA [Yoruba]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_yo</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Yoruba]</td>
+</tr>
+<tr>
+<td>
+<strong>Description:</strong>
+</td>
+<td>
+Wura is large-scale pretraining data for 20 languages popularly spoken in Africa.
+</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_zh/index.html b/datasets/language_zh/index.html
new file mode 100644
index 0000000..b8d7020
--- /dev/null
+++ b/datasets/language_zh/index.html
@@ -0,0 +1,1149 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_zh/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Chinese Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#chinese-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Chinese Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2015-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2015-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2016-40" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2016-40]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2017-43" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2017-43]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2018-47" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2018-47]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2019-22" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2019-22]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2020-24" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2020-24]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2020-45" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2020-45]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2021-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2021-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2022-27" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2022-27]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2022-49" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2022-49]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2023-14" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2023-14]
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#colossal-oscar-1-zh-2023-23" class="md-nav__link">
+    <span class="md-ellipsis">
+      Colossal OSCAR 1 [zh; 2023-23]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="chinese-datasets">Chinese Datasets</h1>
+<p>In total, there are 12 datasets with N/A tokens in the Chinese language.</p>
+<h2 id="colossal-oscar-1-zh-2015-14">Colossal OSCAR 1 [zh; 2015-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2015-14_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2015-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2016-40">Colossal OSCAR 1 [zh; 2016-40]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2016-40_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2016-40]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2017-43">Colossal OSCAR 1 [zh; 2017-43]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2017-43_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2017-43]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2018-47">Colossal OSCAR 1 [zh; 2018-47]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2018-47_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2018-47]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2019-22">Colossal OSCAR 1 [zh; 2019-22]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2019-22_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2019-22]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2020-24">Colossal OSCAR 1 [zh; 2020-24]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-24_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2020-24]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2020-45">Colossal OSCAR 1 [zh; 2020-45]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2020-45_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2020-45]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2021-49">Colossal OSCAR 1 [zh; 2021-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2021-49_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2021-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2022-27">Colossal OSCAR 1 [zh; 2022-27]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-27_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2022-27]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2022-49">Colossal OSCAR 1 [zh; 2022-49]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2022-49_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2022-49]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2023-14">Colossal OSCAR 1 [zh; 2023-14]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-14_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2023-14]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<h2 id="colossal-oscar-1-zh-2023-23">Colossal OSCAR 1 [zh; 2023-23]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>colossal_oscar_2023-23_zh</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>Colossal OSCAR 1 [zh; 2023-23]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models.</td>
+</tr>
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>signin_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0">https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/language_zu/index.html b/datasets/language_zu/index.html
new file mode 100644
index 0000000..bedd7bc
--- /dev/null
+++ b/datasets/language_zu/index.html
@@ -0,0 +1,669 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/datasets/language_zu/">
+      
+      
+      
+      
+      <link rel="icon" href="../../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Zu Datasets - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#zu-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="../.." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Zu Datasets
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="../.." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../.." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#wura-zulu" class="md-nav__link">
+    <span class="md-ellipsis">
+      WURA [Zulu]
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="zu-datasets">Zu Datasets</h1>
+<p>There is 1 dataset in total with N/A tokens in the Zu language.</p>
+<h2 id="wura-zulu">WURA [Zulu]</h2>
+<table>
+<thead>
+<tr>
+<th><strong>Dataset ID:</strong></th>
+<th><code>wura_zu</code></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><strong>Title:</strong></td>
+<td>WURA [Zulu]</td>
+</tr>
+<tr>
+<td><strong>Description:</strong></td>
+<td>Wura is large-scale pretraining data for 20 languages popularly
+spoken in Africa.</td>
+</tr>
+
+
+
+<tr>
+<td><strong>Availability:</strong></td>
+<td><code>direct_download</code></td>
+</tr>
+<tr>
+<td><strong>Homepage:</strong></td>
+<td><a href="https://huggingface.co/datasets/castorini/wura">https://huggingface.co/datasets/castorini/wura</a></td>
+</tr>
+<tr>
+<td><strong>License:</strong></td>
+<td>Apache License Version 2.0 (commercial use: True, sharealike: None)</td>
+</tr>
+<tr>
+<td><strong>Tokens:</strong></td>
+<td>N/A</td>
+</tr>
+</tbody>
+</table>
+<p><em>This page is automatically generated.</em></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/datasets/tokens_by_language.png b/datasets/tokens_by_language.png
new file mode 100644
index 0000000000000000000000000000000000000000..89cc40dfa0416a157f8199e4d318078c4b64e61c
GIT binary patch
literal 21391
zcmdVCcT|<<_BDzzhNw}Fv7u6;q5>jF7o^y6BfSbzq7>;MMFa#)Y={Vo6zNi=ZHn{`
zRzNnrh)7fEUFn@WAK>Jk-yLV%F~0GA_mAsHMhxuzmgim1T64`g*ZV+0PHNMJ-5coW
z=r&Q#pHZTt`#FM+Zq1orf5A_-o%Bn^e?)B0UbI!VG_<w9Y;8a%d)d~?%+l7(_{tyG
z4Xka9EiL%Dg}HgT{xGt&wXzZA;W7W`1KgI@S9zK@{CXb;S#Nb-!-kHI;WGJmO{`>$
zF&&-KJmt(ul^elBtq#Gm6CEW#rV9A=Sl>R5V{|$g#h&*dP``a6$0h0TL5;V<PZIxi
zu3aI3@4D}#8E>XHyY~SPgU2c5#pTb2TwGkID;5G3v^sQbMh0eX+z`v{33eKp3mvu|
zc9Q6^Ia18HA6I}sUxd<fMOXexC${$Nww1pI|GkggKmN#_+_G-vZ!yo^ep>$9EoFr>
zYnK1^ih~_XL;if(woP&6Z&n*D)#<;p6*D%O6*us%{4DiRgUOTkS8~G5IU<5f4f)+4
zP;BjfTmIcGvyPE4T^$`^wKLaDSXMs%zjbzZ_LYMiyPHK_`-P9%u#ejCt)DR~q{2`?
zZI^n<%5S+(Y_X$%sd;3SCs=+5b&f%SUnuCxm8a~H%fJ49Vw<8p9lrXoA^uKAknfhD
zFa5`|9;tu*oAcAri)z%ntE<Cs21oheCaW-;!}#wbXCG-<(mJ|wmX<C5ZGHZ^!gc>g
zmizz5hx_V$Sl#o%14T2l1g;$qH|*q2cyo8-Rdw!_)yobD;*9Y2_8woDYBz1owVR!2
z)ZM;kPmqE0*Ee@nGfY$k%|0o}1_~$^-keDl<XZWXHTOf_;`!!tDJdy|$L*8+xHKZ_
z>s8jRTNh{DRkq=`-&9`zz0SZx@zklm<${leOD;~Irr*q@kffC}Ud~??d$lTbbZW}J
zSTCo;K>yRWtxs<>SX)|BH|(rT{?ygZxuE5-(Ocxcy`t-Oi*}imln2uCSvjJcm(N`K
z%i`ipSC^e$nq+IWTkPY<|MKNjl|Frcvua22qos_C?hhW+#H&)%&6@a*Xy>XNw;#?A
zUkwv0U160o6~Q9PUJ|pto7!WGii$+-`lIARM2+jBWjEhnyL^IMbJ^k7l!cW~8T8n#
zY`-rO7K1+*x>|LY?;2_^(xp%+O;6}omcgRz>pqjA&lx7QQ4e+t7(LH4Z#HR6()Q-A
zVrYn#-IJ)DC)9juWjp`TV-xE(`}IC+eU_!}c5bbU$@!!1N%LRDU0q$*{rq!poxEg4
z=Hr#amd-G7Mi^FwCiXW_O@DlQ?>L;(lccGOXED^AHRAVnH7z(FQ#r#dc0)lnNT^_V
zHsb8V{`r})&G%Q+$K178x3VDNWbda>lk+4vIi>Gz*lsPbb>$2#k58XfNz}|px9*aD
z5fh`EF+A9mrkQCL@b~T8mLg26AG&|~Z2he@Yi(!8jl~zn>ojw1;}AeXmTd`rbum-}
znjc?rudlk!hv9_P>wb;vF7rD=J^OG+oI?1iKwiCj9=x;P-!ceZtz=S-RXF)@#}TCm
zzaL0Fet6{@)<-R+l5ey-PjPT@$^7{7L-hLKGY?(_kWW#O>-Fx{M-pU}?N-*=(-T9_
zdh_UUQW@~))uUF0Eg87}`xg0Ffz8?sD?cCZx#u4H_3i!k@lBVwcDS?aAO0sf`Tl2e
z`jVL)zRv^!pLLx6yH|qR!}*OpPv7@f&(Dh7OKIA+HR#N)<&<o70b#S9M`!+e58dJu
zl3`6$2Lr8Ml$^;Etu+5S#QOY={B^$ulkk<io&DlcXabW}^!PKQ7a_;{`WaU)cFpFU
ziU|7#_w9$%?MHO9v#m}F1s$O|K71eHh3(<DcIBzP62~(oQKp@Dz2U_Lf0byNT?;ck
zVVt^!`M>}E`!;6gMDGI^`iJvID*t2A17{ri>*LD&dBO^p7M&`>Bz21!A2Eq2ZP~IV
zP{KL)TSv#lXVc`?@gbJu*JbpIJroB%CaT6M9mp`M`Ig(vty`$|XpgWOHd1}O>fuD)
zqQ!V~ZmsO+Pxed28&(9yt0e}`P7K8GU#*-s_2kyhWoKtET^xBL#iDqBGjsa&q0d1s
zGAnDJU5+K~sSN4=loB~yFr}lhw4`3{kt3F4jUCpIYH&d|fUhgsa_Y<L+wN{|3UNv=
zRFD$}ET$@i+l|J1YfL_;$8}c(#b5{bH>m42SQah#8RV^5`->6H?}%f^BjNGN<0HML
zPxkjWrNz|N)h!9q-ZDrgV4+O<YG06h9WI=+m>6nSKYo2MyRT%mrqat-q&1mwYNp4s
z#r^sE@6pjwB`vLBXJ-Y4#}bI^pu$$<5t8P-1#V7T4K=0p1U8dIQ4u8Ei1=rtWSBO7
z40BzKYf8In+LrI&9i{8Kpc*7>`SCuhOYD~~e^QhJ_zdb3H24#B3N$MM1?Nh*)@|IO
z(h#pY<d3zc;v<QdvgEwPMoVQSg;v%|z6bg1_XE<s?;dRRw|r%yihVKO;U)RGaik?j
zxv;QMn6-Fz<XS{Tgzb2*qM-R_0oo{@Mqdf5tBQ$9Jex5R9_PiE&ygdpwnv*arN&=O
zi4d}Axlz}YUr^wCNIrDWZrPOuxJ4Ba*vEP7*u{3&rGj5JGqF<;E>92221dokvhUIx
zAFGlaK#)~^d>*iG|NaBoEUmULkjFWW9FbL2ROD1o3dWCB5sDgIeq7DIM=pJI*|YSH
zQI97CF_~`Lr*u|IimDiSu4(-I%Dss_M0!cqDX7nPuzB&~#fOp-DQq-lB_(gQWSt|_
zhIk3*+1@UHT?Dar)b#WXiobkFo>F&L`NaD5>l>MTyLx);Ta_1+wr$xGUgmqana9b&
z!Fc-n*La<R+_y~!J~YquD+0}I-XudyVRFb1P4^P(OIhAQ<jg){VPW(0^Mn3Jc08ON
z?~BCA^e-J%cj|m1C2-~4uN0*#S0eW9+b3XL%eI~CqJnvIrd_X4q8ch=WmpKWP1N7(
z#&Sj9Fh5#YPCr)ao3|_W_4Vc2_IYpE#%@(Hv6DwfB~;9zb7-i#Vkxrxr$Lm{+<~OL
zLE6^ee@{)|!9$S?lW=aB=;-K3^D%1d7#N^(s>XlfL20-o<;hgj`Qa(bLl9MNCU1*S
z5UOOYDy<+tfkjn<jg9T_{<qha8-GMOca|<LC;C^a8+MdCy}dehtbW@gR!N<r`O$}k
zi?ai!Nh5fQCJpf~u=FC<V@t6J0+gQZzBBCs{?0dVZewILY9B_t`0yUQU~Jo2x&<lt
zb@&`N9_Ytp-Q=;Js%BTg?}jg8l#lA`fBM9*bnBm6mOYndTz62=qGdXFV2`lHU^P!&
ztJ|wrCw#fJH4#P5eqHS@bBPF}NUa=eig$lwa`N=_G*?l#Jx(*!nx{T7&}iRV$iO1T
z&dM4#;Q8}PDt|SBNbD~U;NS4;ui3-1SKhmGh>FfeTMCPaXdqj8M?F0x-~a9X!^t;s
zh;@Jje>Sq4lM5yTrTw_#G&4;4nk|bOhFkNtv#?kcCF1Tm5ZC*qJnLUxe(Q7G-a<A^
zLi^;&lfvUPd&JW0P*ywJf~9Ayo7=B;z11fjMmux*N=S<VFx`i0wP8DF6uzxxzkrg#
zyj#$Aao%BKq+ODhT22eQIr;!6I6yI}iMWe0DrngzRuOVs-Jr}z(a0zkNy@J)ccgGG
z>7ZP2EU#WMySTXa>({S$3!2n#s?g4`>5;P={G{mZb4Z?L-7mkG^j3%W*S?_1O0EL0
zV;9ee&-O-Yrs_XH<&Bf}<uuTk!qYNpN;LqW_#oSEfs|nT<<(z%K4&`5O`5b9E=XFC
zU|PTRpwXK<>xF&w0JDTM#8)=mVhA?f(&9n}9$bcHyUyOddr8@dqS>$`r|T>(E~-B=
zTLlMK0Y>l$s$;@j;^UNKNEy^jGknArcjB*~Oqx<(AW2Mn*_>Ym8e=F*6#qkPVPX!c
z9=w29PuOPW=4e+^^e-J-Id?Y0{fDtKJ06}r>%Q#?+l%OEPOQbuXjfj>pDQ1*;!%@I
z8b_Mf5?pfwi^t+JeYqi4@iaj}zi-`|cHobd&$ZkJ=8$4_j_T~xpX&-aZqF$opn^s1
z|CksgWd1p7s@+xcKOj|aeLhlHhI#Wxmzja&BDI=`v)gy?=Cri5+@+W9IAOAhMeJDf
z-zzu$$5k7nGbD9d_f(n}<r~m;G$iZlwzc^?B_i4NjrY|xJ^k0pRam?Z-xq)8!SA1Q
z?Wk;hIA>#uUdecGq}S)hFiF?Y^|Y0Rd3^VDOgDMPmIV`jo0&zUdV7spou^tI=YNz8
z4cHI2#Eo{*u3lPAik&x+NVIcp6^C1Lwj;68v)nkrQ#c(l+WBEW#r@&KI#ibiJPfl<
zD^208;EM2l(%z!Bz0YxMt&Dsis@cJ`DhrO)_OZq5w9RUZNGJU8f4P-x^VH%H5Ey4|
z1Wf9x47p_A?wn;(9~*w_7Tx_f|3FNwGa6hwDBipWX}Lu_Q>dJClS6`r<%}AMzr9*n
z*Y7I0FKk}NnJ$0Tlq+ZL`fAS}xZvl*#tgLa_3Kwrt1q{r<-?MqqS&0VFa$*c1x1sr
zY<gp5y*rsUZ&KWS{CWZbj}NvUidT9e&8hJG7V_<L<bTs|?;aq5`b5ix#N(^j9Fz^4
zH-Frzd-F+a)9RzRjTl8OmycJBEVgTGL}*>gG)vL7sDAzz5Pnyw_UawU;&WYPz8CWB
zM+%2$0kcq?N?b%%!quiTY2|{x9ApaI9}gI-nQawLas-YMhk)>o+ImRN2vIgWO5>Sm
zv8igxYR@zeBG60x!RnRl7g0I0X&2ua;GjAH^6fiz7%mBl1yvl>^FV5d+I#xmd2b~p
zC6yd&1M&IqZ=3vgubflF6Z;pjF#&XEGSXJ?Is00q^I{CL;bEjW-^1#asYBG{!a#B3
zHCq0_p$#V2>^zz)4f#`>Jg9oh$6)%Ru*DasN;+O8PU*VAnlFD)8zSD^9V;`Lu(tET
zXK4y%W_$+A@zZ%COXbYjvuCvh_x0CEv8w+0=bxE()>`Rtd)*tYea+gYXl>?0?Pw8l
z*ht@B__$F&Lvy9e{)65++bvMQ*jr+zb8C}XCKoMHjhbsW5PkXWy;kdEbi#*3grpzz
zFyx7@q}j$!g)I{}0YUf#W_q!R1N4bdYLRfUHqkF%9!38X>ohgo6tI1DC7vM(q#Dq+
z{r-EzE`CFOjm7HI48|zs(iBP!@bACd{XehTbxH&1s90Hjw`-)QryoFN{T~ewr&5&L
z<Vd?~>%`pDh(9{Mx8bV~*agWScM-Auu$mDTU0PZ?l}BD$y0#_9CgaNc4T7d0x&9;1
z7<tAnF3g#>7di(DnFktH2FIavQM@aIMIy(?ZGrPHEX+>OK0aSLcxe+3KH6PD1+>3t
z+L+W({AiC|e?14qzvrF_0`k<UQ~D_vQw3K!6l<aT?y)kP_6VuGy|*dt@Wz#cv^@Kd
z-92#c+!+z;_>lKkYCdO}GH+M~wo^6;8Q{%;vGpro%IZY(LwS0zb$GRhv$84whgm|u
z)8DA;YB#;vqAPdh^H!{AZtMm=$|5h-WVO3cJ3BjFTP1jT6?L5_d_$dvMeIh}7S#Ax
zcKWMLE!9klRJFufbc=$vy$Una)4i|hSyf(0`>I5oSwAx~^MNt)&Yhn_j$J$DG&xv<
zI_>-9N%+UaO9LVHZ8@!8D_ew4<khC_ic>8%Ud3Ll$tQ3B(nsSdsz(f^-}rF_SHeNN
zjzV3tmO<<}uzxJB4@jWsNcido8*4$7e`s&lZOyY+pBirU4Gm2yTAaNAGQv9wSZn~-
z{UmEjwv}z=CdKZ&O?;A-mG#rwwc8mPpM^NA{P;_>g;A>Y>fhBA|6j?rf7qY1nL7Z9
z6-Ch}TyI-Fe#;$%>!Gll5r3_vH`EXU!~_C3e?&7~A@BMS0L6^i`d?R0Gll(`adXCc
zJBQLc7YFo2v-4wB4dJKhAMF;DJ|q{cU}R*JdH3h#51**)87lQf-XT!DiN|%S)t*x^
z;!d$A^I?jp{cv#M^tXqDl+^=geW`Y1*OT$1A?Yb16o>pD?&L|b>8Whh<Xbs)_I3nW
z?TZVhpeww&!~E>K%LBO1)}xVEefj6>|2$Bf15f_<zkgw1NkF<{2QWjrRJO8u>AV1X
z!**$5>VH-=G)d~=r|uG{{`b0#{Y_WHe6yyyf>6+&0*7m6S%f5>Tzzn9$cz-F0>=r`
z8&Lgra;muLmahUsRV3V1GBjIb^CLIDVk5M4nkiTAq66}I{Mf2N2=D?~qM+;6${krT
zpzbI~%P3yF_>5C6AyzF>^Fvh?PoB-n*<Mv3(<f@>C}C}DJ$CV@9$zI0T70mv5M>m^
zRmJcsp=G6qgJF->j1(<}wrZ@L;Y%_0@#0w6k`a)*Bkr8=VyywIzr06E1oj%PwAf=8
z;W_{;j!*?nV+>L*rt6h>nyp{imA8Vc04J_>6sKO~Y6@B1x)!8WMj>koSy=!&i^_13
zh?PU%3hJ`jLK4QKUHs=r&crRGJI`IGD9OlpkS$Z-G-dcdU<Dd+rRGfYiB|h|Ql@fm
z{D?q%e1HPb_c=xH{+qXN)BYpdbh|mu4P|u!NrJ-xcn7vJaK8P^rg40|)p3=l&fawY
z+!^!sLhTSyJMW^RB|%s5kv69hRa@KfBmZz^R<U?c?DC;vjoPeRw~}5!glY9D4wvAj
zQHs#$XLsi!OFEBL2s`$M-?!jc;Y8?K(5uy7;3<d#ebXnoG~=}y-2z3_bumBFqBWjH
z+>xD+Pl=~sTq*OHl|?&I*<G#QwQ09NHF{>j%Wr<3yVG_0-fvHM&#l|Ats1#sHS*k}
z{|9{foNkOC?lwjnF6uZQYbF##8VNMy+u0TVIQ~y!L#!Op%2ucnp9%(Mi@CRHk4}aJ
zKmjsxU45LgrN|$2`t*cIn&{V^u@x2ydQxGi#k~CJl0M)`&QJd+3pt(@>N49${RsRU
zZ`oe>In`j}*=@Iu|J`7s`1Os0L55?FXqoxdGy`@ap#VOzOs|;nUGmZ0{DvMkZ{9=>
zG@Tm-P%4-or6JM#qtXPJ?_KjnO^any*9-G4Pwn<flr>zl*rQu?qQT^-j2iWQ_`b5O
z=N|E1`3=XjI)Y6OV8wpmz*%%PK3-m4fr6&d=&&aSo0JhGq`avjD-Bj|pv&q!b<^it
zC9Aca$8ql%kKK8`Tl7K|>M1^UI8t^2Q)t3X^d8c(vJKVYr|rJJS$FsT{Q<OYfs(F;
zEO?NE<-gKpSMW(aO8m@6;k6JoDR1Xdcr>ud-ss&k+;R9Dhpw*fvmtB-pZK<w^o5Xv
z$}(|Buo-`Pd6^XQJ(l_5`_8%>p@)eFw7qVymhPB_%uS!lVm_0>5fF0u2|9sk&Z&#J
zUqW%1){Z@~{-(!V91r10-z_a--F8d5%-b~ro_19PsiTe)$dH<_2gx__#ag;An@1b(
zvNJ!7i6u{o$qtqB#*OJB3)0t`e@_44(sU)|ABANYWpiO}s-2Y0Kj`)Nrd)g0&$)dV
zUPJe)DCfoUzT}qg4wctYqf#SVuI~~G`gY|hQ}Fk;g{dczb6|sScwXW}qKZhr)^QFS
zM%vqUq+lxEWnmUvX1Sjgy3I;^2R?ghw3%*P?8Y7~=AO*;<45Ad1wS8~r#-}B+ZG)P
zBDmkRhTh`d(R=CqX%mS`{V9BnRYdCy*mnwPM$LpS`E{vLRne{0q20A0L~W@^1b1-V
z*Xg<>&udZ-bN?0d_F9iN3CvdpNM2yN_usvHHx9|Czb5iNu=F7cYS+s*Z{BPJ_5##L
zeHTBTw%Q@lqb(#XFaU}GctS23t7Gq8(ou78aG+0J|8FH_kSUON4Eqfz93UE-DMZYn
z_NxNc8Go!Kk-q;s@cN>G$BKWbdM($5f-dvf)7_HF`?}8rJq2Nz<`)(kq9|LPM?V}Q
zr4Cgt;pWUJ1>%JFHWrp7JmT0(Sr%DG1aOv?`^zyl(`jI5^wP!C`*Iowx6lLZsxK)P
z*>3%xGpNxr0TF&jF2%bpEq>0j3<Kd39l(DTEZG(BC~%piThB0QP%3g+u()J|BW5#T
zfAdgPN6ea&w(jK42D_{Lrcph)A$~8l;S^BR_lv?>XLgMhG0^fI#`;0keZ&^cSbgBN
zT{+=I0sxb!!8I(IPC$KA&%3)DMVWT?^fTJC$!Y&gm%Z_Rw_6#giql{lkLE2i+0%QL
zfAbV73}Y85{m0b1;~v(zN<81X)0@Nz$JZeFwWhyZ>)%J0or6!^`)yYQnuwO_%$0&@
z$4>D4tpqu(^Gf<!5u_Ecx?mCS7Jn%#N!p*=IcT-2yb5Vm*+OKA46~+DGza(+zP86^
zX&1xF->sznoAzjN!*h9SRQ`h~g#|T4$ah;F{PQ%`{z-@yD?xILslp^9R^jXXsP))B
z&bNl!b_Si_weAn8$YqxNaim7#4hM}Rv-)!mA%%~-A^W{=EtKfjjC4y-BRP}l(Q-R`
z>(Vjjy|RbCwWi@~_OFw>S4;bU6-58RX#bxHes7u^RxTjxzw}?z^efuzmtw36sKgQ8
zvnpn0`wX^O(XZSzov@$CZj-spjk#8PTbFaVu4O}P7vu>9SIg^_rsmaJt?b81+1bBA
z==}2aS;Fd~%e{76ZWeAy>G9dWa#(GESRQ_8vGh3PW4Fz&a#cY7elcJjt!Cm!ZfCKL
zQ5Q$&i(0Qa5rKd_^oy=v=ToyCwiuY5<DKvSW-hhT7kt0r|BcV*W}CCC>UXt^i90kB
zcD#4C(d_G~BeD0L`&;Cxiji*hogueY35I#<_?WS@<Ws1<rC+BT#7HC{XxHsu4Ks_s
z4#{+L8&usH{<l`WIae&eNkSuG*tIrbSy{2YV6bg9mE5|GylSC3A73EtMX|OEZZ2+?
z44dDrp4+v)mE+$>RCJW_uC3CsMWG6v^V+jqbx}M2`d_9zcIbIrv-?&4vd5E(>qD(W
zT?VSf@7X^1eeG0{)1UBG|LbO1=}QsY3ECXH+`+fvQ%~O0F6gY=&|14lp@<!N`XsdC
z!!Xkh9?Pyu-TH9nPJCVIn@E$Han|0Zkk=*?iW2lwUbYUb+A^Xs9=}bsNa^TEa6TUS
zI4N&7D)J)dhfQfof8-knX4gj{7vvNI_qck_Tr5x#uzY>VaWj6Rk{dL<-;(O@-d*8p
zI(LXeqfe+LDQxSFONpF;Jjrj9Q^r1UNat0iL>>R0qN&1WIgGEpKM-@cRW|HYn_8TL
z&d8<Dk{>9$tULUlC8%*&O7Ps*)r)(ui*I;l#}P{E4NA`F@l4mKr6q~$d$n<U?f1EK
zb~wzmeKzIlxZ>_{yRm9LmyVjl_Uy^IbRu(I$w_Hm6}9MY@mu|$wGI65?ot1v#o3}i
zeetWUt+oB}?d-BGAp0OvfbfC+`}flff-1Tr*h(LOwasu@a6EA65ZFqEbJ|kf1MUmA
zgM9t`qQR*dp?`Px^0K$XF1tkq7|7l$6eNp(baZsQqd-i&fA_A`q|3ZhGl}b8>{}Om
zz+b9>nL*j$L<8}FuYWhl1xubSpClXo0|Wc}f3;_0Wn~o~E!|H*(heS-gzW5N>QcEK
zKX%`aTwE*>(elHeE<V@vd3%)9B+pw+{bf_(!Q&3a!sqp`6)>pCFVa!u<*Tek4i&sg
zI}NfD<fuW0=Fp2ew=MhA^*{X1Ea}qnzHeA&*REY?CrHr`Ja#RD@OI$M`sYT9Vmxy1
z;nEA17UwoFFsPq8_0wF>27H>p$t!w#;E}kA`lx&J$GVvCg^F0k$niHDc}NT6qqy{~
zXlWrCnwawY;^JJ<GG^0>{svK(`C~6=+xfg6@-H0|cAm8bX_Uj~O*7EfH!AUDPHi$9
z{qf@o8<8Eorz5+H7=<l@p%HU^NGr0@rnMK{|7sF2aymvn%<uh`54=@-+qVDd|K!PK
z*L}&5#AyCJjkq##$9h_VMp}K0JZr&^;yue;v_rlB8+5Vp=%h#+=-h-}bvp!5@J0iF
z(X&`x&OKh|n;cnPt$>TX;5<8CgDy{OF<Cpnb+7izKmP*Qk=kX)44q2}QdtJpj?j-j
zit3+I4IYA#iGgeg0YllOB|F7s`YZjMX=R=E4AZV-R?l^A{x}N(@F`!v=r6x9J_qpV
z?X8K-FHF({b|u``#{><V2KJruJ~<2tYSe{|qi7h2=b+W5N<7J<+Zq&sV^4_vIAm9f
zw~j;CcIWS}=~;BETiwv>7{gfM%|5hP+1z{yZUS^DPXQ`xh~N*V#1s^Ms%D}{gvYml
z%@ltI9e##RkXOlBZH^a|W8@m{tlvsx+J)&3FBW?`KH!KW8mVWB8O_`BH9$_*1K}TV
zIJb+0cw2!JjYhj5FS(G;W(;|?zb?iLV(|8D+t|R6XSFj3cZ?)Q;_gbTs^lS9)`K5t
zC~&eTJSt0072ndSK!-1dMm76<Ih4<BJt*_sBllGIBBV}tlaC2;=p7Z$Y3~I$iG@_V
zaU=V4ra2W{g)!-co*wd|M&`6+oV$Xi?jo8vZ^QCu;UHA;m9LM4+o~b$MuY6`f8no#
z?$Xusn3a9>>)Z4L{<<zTL6(JaJC11Hm>=L06jTLYP<zR;;6UH#PW`^ot3}|za6++B
z5uazzVxh`vkkg~5ffB8G_s*SOkZ#pPmxWrTE|srm;1WFr2?>#!wp6oc>G*{b#z@f9
z#DqfZ7+S&om8z-w6k;Q}P0vutYv4m1O_h%wzRyBzHUhsCc=Yn$08&S=W`x%z*8{T!
z;cs^yvh$Q&npY-N@9~lRNcc&JIz9X4HiLlc;!W(3So@K-NuImIjvGFHczVbV9ulHw
z&5ZS^gSBu6N7F!6KMb8UMlPgQe5$1*(`)G*zQ&FpfdWezd6--{D_yj-m`BuFNWC<~
z4D`6=w)yRShGu^U_rp>T-oJm(0ix!wzy2a>5^%GbWUM7TDUcRAJ|<ILucS3Wtu$}f
zB>@ESoooh*QkF%l%F91bdbLTSr@$6b29ro~O(Y-x{RpRiUGL<Zk4RH|wKBNo{llFE
zI+C4+i*HOfu4AKANz5gH8}EFRo@5WQVhFM|hzT{Yp$8~6H8q-rH*?tH0N{x`X5-e}
zHh}ST(?;$`%*WM0z4$hms6vIG9Bhgda~O@43E(3=165B{6+WfWo~mSghR9L<?L|dV
zG|4wlBwZJ|O4fwFVT^>{N%1BjNQlL!@)9)&oC&Bi{fl!W{%l?@u8W5Ps{GOUazcY4
z3{>dNQOeF3>w@yCbfQ9x*GRd0divSsFcC!yELZ2ZJ4tq3S6!lUiJV|Fh^Gk>SS&Km
z0FGG{sp}LvCi|<#Co>_mn{#`f^PcnP)~-#`&LbVWYO;>DgoFg47bb_AV-AJhIMqe-
zud-e^K0$G&f_!-K?j2E4V7WN!O&lGh8Apkk?R+ZF_F;$?>olAr1;L+$%{XiieLZ2W
zWIqv=1dSU>7if_GzUl0&&D>z1q^#VKVX6)guLR{<6|O8c3Oal5<m6<Q8(-EqI5`~#
zZ35mx8T{fzy|TQ6%K8HfBP|oKbC5=haEC;1#F`xz6VpN+`!UT&<hr)DHZ=RHDk?ru
z^<fSYlnoIjgbE3EHV2DczB2)O9*X!iq0Mne&d__x!X%3zjPH|_l+0Q*n>`eeoT9Gl
ztO{{O?Zk;&+&cNO*yn1X9P9Y2#0XW65W9f3g@g_ydadBkg5*O8075iMd9gG=1e<`T
z!>G)MgT%wPZ{N24>tD}*TDO7y*s)Iy*R&7fa|o{7o=_)95*e4@{sMwb?!kixWu>JG
zxLN_w(q4`%kQ9tcAMYik9ij6{t*U;0(wlqf{H+-0%nSYxeNaZI2rqB$ozLXOCwwZy
zB$H7#AsAl7Cdu%s%pP56s*@;5N_U#F5TE{Xdy=+h%^JrpUv-ky;}D$j$}yfB>vD7q
z*OQ$AhmaO1J4LjU>4p`%K~kN^t!F|m=<k2V{d6XPb=NMU%rx;x&U8M7aq2dMMD?q`
z)(Yws{{~t9tEafoXuiD+VQ$2oCRMOE$C2S_CK+W{Qa(xL)F)n2CozWJotW0ZmG>uR
zwJ8y~2--><B7svY`z4jN*!(F(@WhKLr>5GRl5ta(OI9$#rR&`Ix(+Uj!^Fja$nN!H
zjVNX$s$WNmmpa0SxC8FqyBClAO(bpov?hV;gBKCTJhU}#F&}&n`y-b`LLyXm80#ic
zq4NC1nmG<WLcAfcmTi4H1&)pbiCIMNCXd<0#f7xtRMt1<c3rw=L?p*$l1r7oaG?%d
zF*&E~v!f?B?%;aquZXC?lbnoIKH<STIyw1~WE{l#0_=$Hj~+EZR*fN$4aEiiozYKx
z{bk<6D9VI&Yjc@1$Ib23fmr9kOHR6Q;hME|Djy%8Sbu>r7HJmdMk47D`vR_Ow}^F=
z#NxCbv7`YwC`wDalj?iHpS$nwP8|g>QrgVTaR=o?6X0qj>JG%2dIa_hpr<lqU@$yN
zz<~sf;F@}oQCYOl`10uLKq@XGmXo}Z!b@Zm0DO?8F~WKf3{yZi`7^K(@-z$+VQhVA
zRT6|GM{!9^PZxl>PaYN0*WceCHakK?f~JTCUqSBiA?xd*t^fMh9~5Gzn*b4Oj0e)+
zR*)y{!yf*lqvR?_WE9Qy9Su)P>ZXe}4PO^@95*7aCd8rwHvWNS8`Eh9@hGw<RjFzd
zrCYAoE^R3L{@n^5KvMt&&P$mIY;j1hvm>rcRFDj%zz>m9=L+VA^8h9rokj{`q*xr@
zx#YLTD()wn3Jf+_kH@6ABXnAypt19zmzI-LgU5z2=j_P9@#@K;<c1JHr|xd(!)l2-
zO(ujoPijE!mLdTNPgRENk_)jZ9FX?08rZprC%YZ{9?nd%NOf>1SxP<%hpOrL<NZ!w
z6n+G$9P}T1niBWe?GczlRPoeMy#xVJRpV7CFd`^IsbZr5KKinGaYgb;vcG-vraoC$
zvS6k=h_5&nD-f@l5ic?QWi6?4X@(U_NK}F@^EVLo^1!Pfks?*|Z(N&u{VcC+UZJJP
zwWuJyoXMI6vSg^Z6ESoI0t@Zny7(L!TL&rkDO9FFVawsNUb&|lADhgCU?tHd^%A$E
znsAYqRGtkRH!ADvM?uG-dU*UsOsa@{7#$|v>Zpr4wS+vpcJ11vw?#|#HQ+$abfYJL
znAr4e@Crh{(}v(O^{nR!8?%V@8K4LC_wL&Xs3%_>XiQc|$PI=y!7rkU?tvI{A;RSe
zy?hEojyyCuMHDV%7i|*N@T0kr^a~eStm8z#w?5Af6keVWY3I!KLq9t7)gB;K24UGq
z%x{QL;&nhMr&-QHDR3xH?nMZEk~&UYZzS?{oQ4E<>4B=Va4jg9{A@}}4P=O}j*dt$
zW-z1N)Xcw;a%rbx0E^Qgulc8!CrOGVf$qtCyv#7o7^arpeEM!=;}H^ktnxS^ib3{k
zGdJ}S%TnHO?uE;jFK^hisR3Jziu;a-B;+`lQZkN8N51da_rf2>r(VSo$pAi9OYCf1
zLE6nKVuisyz5vD}4-Oed?fm&tW{p(+tRl%`)NLY+5gML67yzwiuRIt|;)&4@9tM(A
zx~syH0e7-FXD4>uh=O@42ol9Jpo~CZ`T!rFZF(NOh&Vo6ln@+9hVZK$6clV=)RTc~
zvrFP;w!>^+Oa&l?_O6A@Ul|yLQJ?uO3umI63Y@YiGBN@--Q~RaVt2t*8$Yy6jZE;{
z*&;UG9;R<ZYtY@5ho8P*4h^n+tGw7{fNfLP7BrEDu-Qbzreyf))|zgZ1-Rt~w2Y07
zCzs^p<b(+HhYgDNuv)@V*wzFPqx|F)Q0X#?3gJ>SzsJC80lK%mCQ?d!duv>GWr!(^
zO@^>Bn_nMkGxy+~obrX?s}+)6tJqjM|3Oa9GDf{}2)1UMStSNj^OGJ22b&3*efsm>
zW@eL2%XSN0mOXn+;odU9VPybgc#EH5BLx9QwR7nfwy0hUIVdjPie))>?p_~A>Asen
zH0(&ray$fCWL4So=iMCO_4ME^hGC+tK1sXTcJNb8I!r!di_;w%I+4TN`}e=K9d2pL
zAFsVo{EQZ+2+sc~_<LdCfMP~HG~Q;eE+LYxZ3Y_Jmm$C&1a^R;!ZSJ5Vfw(`oi9?#
zO9=ZRtLTt}ySk3UdolR1y8B|ngdnu>AzdCNaVdgK;5SqmDsD>F2<tttr7E&~YOfx5
z(Gmoz&oUvRc(~<o29={XFW2)DHw4dYNmlLp;HS%lGu?Z}P<Dc_M}taB&#Rxce}G&R
zP#JPO2;E}fkxQA3dLA0s-VW#uWyiWJDoB<ii|?|uV2{?K5XyJ}p7-Z4Nn|4-NV~ku
zqP9n2Uyn6zObXE`aLkC)Eh^k~^v_!*&i58tVK$M)4U`u;&n12C#ts$4qmD*eHiY<v
zWr6l(^CtI|Z`dXDK7#N~2oy7pA)^EUMh8K^uAN8UH`!sT=Mf-i>PH?+Uu#~L2rSuk
z{O!NGvoqLq1${;rgK#&OLBCJ|tL`ISe+IdBhaToJl%*h8Kmu^9Y2ZirjjI1pE^xH7
z`SNOw`q?7$B(Z7I2gGUTWs(LGi<=!F?vz>J0ioE&PR0L2Y3WfIp*sX=v8YGN$WU9E
znTNjlRN8diZ+*P?v<xI;8Q90mft!pF?goUVNP{TK4=LUTFfRzMxMu)$K}gMkFbXzF
zs))ZPJBF+(^kE~3rFomV(ekMX63!&51T5QHf{$H$bCyH9#4s%YWiJ4$A4oR;h3xX%
zo}WHl97E6qL0E1|=Ev^j()Da<s_?O`i;)*b9uZ0|TFhOT>Ef9Cz^Hdn=GmX<<n_P|
z=<$RDK_h&bM6BPDFy+$9u1;k5n0yr}AsQf8V{`4J53}zLVf2f}qae{4*%tYT7iynK
zG&G}bI5E;#2YzFfXn|FxDM2fzwy+h6rW|Hu8RTDa3sy9u@9x~W^9;U5o8cCbxl!2Q
z_^mt7*F8VEmLEW8=fQj%Evm=omzQrlAS0GPKY6<xyHGp0dM4@SVlW_P`~-4aMqhJQ
zavY4q>o;uRgGD*B$TcsoydwnJ%%-<`ZyMS)WHpjGFf71_DkX&SQDz+i(uE)Bg5T=P
ziMs4-9j3!|!Q9A2v}<12w2ZuO1k65FqlIY52bt3iXI(dV76>px#3XcZ-@Z3&2Mz>m
z+{wNCywHrd)T~{{9wAohORq(?Cb~!=+$PCX(bxlqh&Al;P0mOrpbR|YfG0>&BbNha
z#9np0xm%N7cmFw$QtR);D6NdvoFD()%E~%Ymdy=+hy`@nLr!R6qQktszwxsAxe8mf
z93xSdh7Mxuu3g41GhI9DBBdBe%#I-y1j30L2rA*z_+V4ra;{mJY-SPuT=I+-$&WDw
zBjUS8NwU-p)z~pV*=*VUql9%Td&{o@K!o~^<Gq*v{`>E8D3XITeuhfq3lseRApO;V
zmID22QBirLHkD5o1I;{mF&J|cP-J<psweAA_LKlHRDea01tJkk9jstFg8eBxms;e1
z&GcgeFR(Ws2|TGnDFy_{Z@UFb7e9+Sj7E{I<uH`dC<Ek+oD1ThY*Awp$Rw99sm{DL
zw*lCv{mhv&pZr1Y1;U*73?LyO?P^sZ06cN{2LVRr9qkM0DC6TCo4d<fym)hzt<h<5
zuqCHXBJfZhvQ|0hLQ;l;nd{;<GaY+u;0nj`Gvug1ka$7QpZ`$@lRr=^6Is@=cU$G3
zp--y>?Z(;-%TRZf<dOhREP!Ok4@i;YF#Yv@tHm7*jq#IPBL7=~vtaAwPkg*|kd5tK
zd7vPTBw7`!T7~UMTPu^;jp|?t=g(s3Y`a0<2vv!Vd5}~^zB@9V#)B}be#WaO1D2*7
zySW^moO0pzn|c6H`bZnTU9x*6B-#c)XVA#jH-C6<fcYkxzE6nAU3%QE0Rdq5;xy8t
zQAw^DX!+B)B6ka0n4y%V&Z}0}y(=jR0#^Afzp2az9Hx%TjDd2Rp==sjOuh1@!f`Fr
z-jtYQ$Zh13jO$}-kj`91wTrhhfwr~nt$tk}tN1nnV?nfBtVIiUMall6KiZ8bQ&YDU
z<e4V0*+`lv@c=x>2jp>3OsoYP=n$=MQ>YOjVK>@6!6L+>>VV7q4=JQ2nKa9Gi8*7B
z(9Uym1M@J5l0sU6EGYKPHos^N{Dmww?=pC}yK!eZn9mAmmljL4VrV_LqE^pNjaZUC
z^~qeXCT`9U&7~pO72_qop$AD*^KsA-NqWF8q@^yjOIhRwAw4Yb6{^~ixv>ajv}m+W
z8gfOJnBO27<$;^qQDhS#ls;i}%7QTPMx(Pq4Z?uJGlPD5d+X`%m+KJLW8gpnSzV@I
z%LEBkB<K`;E?h+N<0oGvUW|opU*NC_0FfDls@CoH<o-5A77|M2aE8dYlLnx+m$M75
zAQ~O0Mz@Fc)$+Kn&p<Y$IgA+sWCc;XJ-@RdJw@YM2uzwZ2-yEGj5Nr?DJ6^jR1VWv
z1=dDu`<SQq%+6X;3nLk%lMkgOP%&|!0mo_+!2b-wSQd>GA?w64OR<SXKs)lG3lS!`
z0l2)1mp&@14v}pPgT!8xi7>9aS_J)x+Qu5gzO8C=($U-Xc;XoFmmo%3#y<Tm%*R)Y
zw&p#7IAU{yY2EPC2_Tgng`?JxKo<<#kUB!5H8Pqbd&`q2Pr5NWQP?46*dn(yKEHnB
z#=LH~tM)sQ#B%LNEC|;%hNd|P>Hez7rXvRr7MBN$)Crh>t}n+GYUSUkxpV)15O|g#
z6xz8T*D<Of1Kip&dHl2Q{*6oKNezc}^+dXy^4~M+X^CGJ4L-!dK|$B93J=&hFBWrM
z3y&J?`1(A1OJ8=yQ}7Kb;op|UoxhS6A2+W-Ox@=^nc@&gAlI*rPE0faUM{zzxQfLK
zGTB`F_n(f7izD-C)W~cZN5@<OyU}S3z_3pykp>3{&-LQMPBKe?NAnSTc)zZ$?#}1}
z8|)>mrI{*MGDroEsv**qWkD0(3qs5Q7-<tXE^*pIdm&RiIo4gHZo2n!{7@pvR1``^
zBLKxXJf$2$LTU&Q(zd}OC2n1$1q-M<G^Nh2t^+(gJXwo&m3xR+8^K2`weUOmfoB7}
z3&&F@^chN2l4mcU>*7odib5=0>*4@0=ub!&f91**0lR(`Y*tIcD1z=C?vLdrpal@$
ziF70+OfZFY$~&D9Ss3mB%231wYXy&1`uO=BgqoyYVio7=xF`dng!~fVNh`QE$CFhu
z34o;!CJZQ8ji@s!nPwVuJklk_#U}6Ewum~<rmH1s1$5<-aS_s@Vn3RoMb3pTq&@aW
z@Rv_1dc<)}tX9}!aVXB-+^!5iz`P)J)TdpQZ?$XG4g^06?DR~}#r#R6knrS`XAR?i
z%<rIMB%U}VRUc=VQWc?ysl=;roIQ7r&{B{Q{DaZ4g*x_gFpE2?pcKWkICO3ygd%uO
zGNXxpa2pI`ziisw>9#YC-vNJY*B_vOgCv|rL}cUy^40+Ou}Qn0sxWnsiG}l{5PAAy
zd=%8#HBzIXS4AS9a~wTd-EeLA7$_AYxTwU)F=I4=3J^u8KhhV&jOA$7^gAPCEC3=I
zks(bU;n48QGMGrwPNL0*ik0N)P*Mt}g_zE9c}k}iig0<v>`hjHOx8Jrdw|x3pF9-H
zz2hB)ZqNw)HHl)#Duh%gL;?`ZM}Rf*<h{7V+2TL|gA)kVab2);dSHe55<CexkWd;f
zS6DP096`>Gjg>Sus^dr8a=kYJhw1^TRG>*{=igAoAlqZMxgT;Mfg6x7$!wtO{Ezz!
zj;;$6{?{>LfSCgrv-0e)da&Wv;<1vUUD7)f@E1jEKaB=>;?-J)UNj?(VAbIFJV`0z
zZNIc@OII`~GQ_$;ky%<3EXnwX(4EA=5Bn;iCsosp6e+c2_hMSAV6^NorbZM}uDsXU
z-X2Fn2i?<I2uLcZ=tDGge@XHZGw+7lzgU@)X$`V%;0bj6_TUilLzy7olJR0<fdxUE
zE&7f5LDA#IG_*xTo+2?zR!PvP`c=b|;a?%5k!OXSEr4bM^qq|NpzFzXi?Z0G$PRE>
zc7j=e-jyd>%dAI)8Vmr!Fna*I219CUIGZw>V{PUk&oN}oe$3X?1E#dPocSe{wJL1s
z1JimJLO})}Q<N9`amTq8ilLxE*)}XE6>&1zg(e)K!J_)b!1Fgr^x@|`7{g0goMdX3
z&YkOe*YSrtyRfandO@d2GiHgK7r{ycaQNZTM}yKZ1uaAN7g1)R-NagbeKU)Y&bV5|
zg5zUaY}oooq!st(LaW07QqK{az2CpSm6n%R$CD-`mY2lKSBV`0#io%_Yb{t&+3ivF
zgNXYRD{4IWDTVN~TFmQ3eys|lUYlKbL|Q7cd>F>dd;f#stj`|ML183B$UfobmP4KP
zWh0pfwoG>dh=~%z%gc+{=TwP(OkC}d9CC%SA6k9Cd+7;CHps92sEuTDk_bKh=dISR
z&aVl=<%~29FXAwxk1d_nd&9IoU{*px!^Ne|OfT|VRhVg>wP!iHQZgGzI9w>acw!wX
zHu{Sp$C;ND&g8^F2#_HoRBg^v+Nz0~s!*a(_{e^NZWv4a^MqK!?y{R4)Hrb9K-`TV
z-<EZv$hCAyuZ%q!&r#^<Zq2#5yVnvwAzA`5Fa(+E>3C6Hl~q6<6FJR;qAE|;-Mizl
z<;kF^M%jyUMFm_w(G_1ne8}F1nXD`iZ_JoaT>~;XqLY6SHTbc=zq;qlx?@*u?su>j
zXXX3_!ftM(z`=v(ylp{_K|1*$@!%2}#*D)hEYSnVY6EyQ{`?EwAA35-Zy$wKQ4;>?
zI^bsPGvBV`+^N<$n0qYKaM^<o=)n`vN)hFZOxS}nZF3wrLh*Oqz>fl1+ZFbD5<Hcr
zW+2Ua+4N#{M1Aomu@;82Dx3Xhej38#MQ{276A@%siD*Qm1_3-8V_;^|yyM$9;u>xR
zYa8shGO-@)u2Ix-0@9<)!h||ubsfPy44s~UED!;Hk0{BcUJU0BaAUEJ!N!mQ7>qIu
zeCu5H`Mi31bYIyfrejtowsWeKDxKP;rviqQ(40U!jZiQnp$w5}H!Smp4I8YlnJj<p
zi!tLC-?tz{$z%YeFAeEf88RXJ=bz6{ojyI(d?|6<YvuSeMr_u{P+`zZpqeaC(}VcD
zvUK+^KoH{(-Ur;>-BXiSEI?N4umh`8^rQ~3v5_H;&j4x84v-H?tB)~WBC2_>dZlg+
zTrBtPxwf~23?5^S0~|5OI(m8<5-(D0^~ywHW$=oJ@l_XlL`x~^pa3=^E(7L7El%ZV
z4`PbPqE$82EMF`gKJ{Oc(eKN@yv6p9A#MFr-&N1j|3Ty0mt4omLBiGNh!&eSVuk>-
z)$hrxK&+gI@#>4`I6x)`JNtQ}9#bQ4B%tvkW7K4H+PMimoB(J7JdQU8Ib@<AJs_EP
zB16Pzc9L>hK{#&Pw#~M}wf*Lpte-ik<2d}j57@i%2Br>faNz|cxY1W3eUR%^(5(;v
zFqeu~F<`1p1=(5!(~DZ(0~pKuNG3}#-w2}N%2KW~SR56|WLz{dS>81YHZvUh9x1fY
zdR?y<!|biT>_mK}N>n*e9K;in<Z11TW^Nf!q6UM2OaKS7*fEKHtOSB<>DnHI4Z53w
zhdhM}I6fJ%;m7pDv}<r6uGE@nITg=%PZKqAzn549k~NuDXaGm*V~|&_oGvl{{WRg@
ziP%GCPhvtHrOB9B|MjA|;p3$KAZQeE^~(YutArSh<4>6Gl`%#RJbb-kj!fqb(qIWn
zCLWasheAn?0dXq+h>`aciivvb6amgj+Q*3$OiqKXOCAGWoS~gDgn71b<dHryQ3Luz
z5abF`zceyMpe)Z$4pG5hlQGeFkbXp+27xl)QP_$qj|#5}E@Lhg>9kxzGZr&PlXWdg
zE%f*ZDb1w(LqENULaU0W>(@M7`{E&#g>djdy#ZH4Ye1@zcWEH+O|$YFpvFJ;@~YZo
z9)zZe0-U~-Eep@F-DoET5Pc-tVh_xp$IfA5<`&ZcC}*PdlcS;UJdDvHkj>S22C<M)
zs2G#d$XuEoNG6&MVfnA55apBXz>ModnocPx0KUj43P>l_P=>a%vL+Le16u3mo)(N>
zqWY5sBVbhY#`otC=%W#0#-Pq7YlOl5H;^+CSpZ}P-tSO{`0N@PacwYPoGMysfFr0r
z-B_8ty#lCH5d;#MX-dRY2+2Ok+(Z{8c72S>kSYh@NhD?DlhVe<soVg7EOlJjdBCPD
z9+a_rBvIn4v0!Re#4;3P?<kElLlX38FICYS5?zy25VG+=MF+nSBM_YoI-nPghkiPX
z3BU2;J(i`Iizf{I+qqdyq(Oo`h)p0#OJgnlW-`>K+ARM4)mrZ;lr=I+(G&>#sn$7S
z+_dI`<h}sT6_~}}Spy0#xz8gPU!H(Zr!;i|JQ<mP#Y7gx8&W!%fj2@RID(2fK!L!m
z3Per(X{gQ@yvZ0hw|1^Rhk->g)HW(~4SfwYph%)(BRdj<)d31<9Un9`gk0w-1Ybbj
z4PlTo3__SFtD6{)o&e{P%41m{`-C`wkSsvd=@7-5yaNVvd}J_fa!AK{qQSL-Y1b}?
zhK@~mQw)xs@#pKG0B+UMMWiM_M0XG@VzZ5jDFI9M!K6U!dm+%6Bc?|t2#Sk~2~D+h
zg^^|xtArLvlzg^0gmr&)_-(9iG^n!nakm)L6wO35YiiIth>VENkGSB8<%bBHz;Il&
z$$<StytSoGZK9(~dZau?M|Z-r3s{&Z97H*M2R;}pBQv^W=E!6YH`{c9oih0oJ6@&Y
zO<u%7j6WD8Nby(jRsjb@X6wHO6VuL;Jpd|tKhj}RywMIaP?Vr`5V}VE$Gfc>=X8-4
zq(S2w`88j{)HO~=mh;&&#<?Q27NpUfRPI6>@<-ojayh(<?JC{b+1Xrk{q1J0xvCI%
zh#Xab1_T?9co<O9N>dBa#1dYzC>Tu-5$e!4WsN+&WC!5gPf|1@lMGZ^wC47K5+}ou
zz@@eo1Jg4z+G9QxI=aW=jLV^wj#brSbsm4>$s|Hh!Q80nLPRra`_nFK!y3Gqrv}l*
zMnR*J-=B+~BiAs;Za*<<0AUen87Hnv<+e$RhDJ7kUQoj&e7J0`kaQ7vClL6SBnQ{F
zHh+d67|5Hz0Gugkj@f={vI?kI7`O<4S-PRAb5O5*lek2m>m2u(ZGjA$VkMvjxlpkH
z0|zu9K0LaGTr^_K>Kk=SE?SD$zRm#<3w$4wW7E?Q$}AU^IxRuC_^2Y>>RDTe;kYyf
zQ*#?h;E?3IkthE-22h}JUqEM_;$@xd?eE_xXxU@9!C$8F@L@BdW>p&T{}B<8%*2r?
zGx(W40PDWrscYL?Xf6~4VH+}teMO0_!D}K#U>02s)u;}MN&o^<MF77Pif&%~og5`h
z$t4Bqw`l824C8T9Cr{$_NO&95p!-FX)1L`ic&rDeP8z)|2!sly-@snfg*WE`u^*t6
zc(E3lv_fo3)#AnrCfKwm25ekeXFzc$v%1bJBKuYF3D;qX)6CSP&ToOWV=jI!=cBVf
zmxfX?BOz+=+JYH>sRMJ#%@cifQg|~Gi3l=riS+wDw~+(dKi<ya+%P(@-6~*?8zv%L
z$YdANzv#7ZXZ&>ApK@1i@%8cXc_Bb+IpkUS-YyawdoM*@_9s?s2NEcq0~nu~<CR)o
zYCfs8+9L6UCLnKNV9G7Pv<Wd>99B<K0n&AzL6{|A6Y<kBkzSEm{Lw##8qA=eBnSK$
zk-)~GW>P^uVo%4Q%bgn~eIn=@jZAaonEd>F1e{}o8>gO!vu7k)6_}XBX|rflN=-62
z1qCMpbNEP0f!7MDKjNaHNspp#%myb_hO`UMNvXFndbC`tuYVH<gTWkdJF0)ZiwHc2
zAQOGDL4f<_y_zgkWMs-!X@r!3Fu(~&h40Z)EviDry`zW}MZ;?|Zc%xO9lfQ7yitbg
z&PXA9l*rsf94CYr`8z?s7yP&s(UoE$JF6E+<s$|ZJPA_e2##HGd!6#LE(0_rT>**~
zf(&~FmFXU}`xY6DCtW3j#PkW|0`eLM1lwSwb<#LCkqNRq^7&I@7znC}2E_*<(3uRt
zIOhRvn1KEa6t#=TK8gd=&gr^1kEeU*W0E$NI9cF+sYXSL0^7FSkpRV$0babOfS`6@
zuOE`b_el#3q6RNVF!iInN3~}MRR;D`ekZpUdB>MZ-gR?KxSruDnomVpoA~yShm(!%
z4Cx$+Z}Nroc4DBW4edOvCWEW3Lj_UIwKaCeOd`2XK)9*oqGwq!gpdLUgO(i*T#YW~
z5QVT;+kgLkAB7As+Raa2BMuTg4RQ>^zo2R917s@dzYYUcJieliR6AQjCbT@4r!ef(
zi@c5!P5C(9=_5ycn@CU@;E9QQhp4`U=UG5f+T5xO9h#IB6nA5+C>-DEgy2LE*$<$i
zk;yD*;sfACPD@@WfJaq>LAeIpB3CnEy1<v$;eYf!6j2wUHRe-YHjTizTn%zb0HvN_
zQ8EKf-mL`D6m5oK(<JP$N?3F<7!QW_Lax{p`S##h8GuZ@n~Rv)(cd4&(a>Gh;z)~j
zeBVjY0OLhqMd~(1`&K)D!luGG(2KQDN9iG>`s9fajD)ZT*C~iC4R8<#EQRQ=R_*eJ
z4~ck8p#hl!ToEutC^iJe{J>Hq6gd<8uZEw2jzx23;=+$cs=7S*8PXj>>+HlRDHvB`
zc2$3r3qAq*t1o6DjSn8C$GZ<x8Kn$MG1~N<hys4N09YwT#^;9XV!TKNt`G^bh|zVK
zIg5t62ElrD>1^I$HMn2sRQ+flDl?n&Rmm7VmOTS?m@3zVcB<+A@>pKdV?V#^4f)al
z;&m9&SA}TS?6S11QqI72LtdhwJUll74^k4~b04s$)vWXJW~|&eUf-a2Y7L!L(DEx0
zUu`EF98aE#`a>8$2n6KW66ERF9dOlRZb2AkDnf*#4mr#12g_7|8SJcu!sPc`rP(PO
z2jviESq!AmyxI<TBXI~H!OOKku`E;4m=0^yagqg_T{twgMEt5SI@jPa6Pl7_D$wX<
zAC)`MRcYP*_bl+2ZU6TG|9>0L`uDpK{`1w4|1_TeZOY<@>pJK0m|u7MR+2WAa#rq4
K(y2fH{=WbXjG40l

literal 0
HcmV?d00001

diff --git a/datasets/tokens_by_source.png b/datasets/tokens_by_source.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a740ea21ce6412c3edeb690658124bdf33008cb
GIT binary patch
literal 41755
zcmdSBc{J8<*FJoulq69?MTX4DR4RlBA!Qat#waO5sF2L_oS}hC85$%iO6DQW8A5}x
zNF-yK;&+^t?(cp7o@c$!v)=Xo@wV1?$Z%bs>pb_l_py)T*k_Q|zTL}d*V0lbl;vuw
zN;(wEqErfHK`zZ=eC0@m$0+_I<*aPztZQ%W>}u|Kn6lU0*}=}<+3u(Xr^{hSr=#{K
zHj8W%5f|n>;_U3;BrPg>{O=El*gM*YzN4iHz>6$(P&IO*P*#|ee->oPXC9?cQUleL
z6!hFK4}5SnWZM63@zB{gK3ZXuYJUp_?M2k9mnjLGg*>V~l;^r7@5e0xA?={BBf_kb
zZ<H)w8Ro4!IKa&u8O}}fu$`lLkbeQw{`aki2is45I&=6%;I`2_UlW~@HV9l$=M9|y
zQzDVrvS9u%s%s4cx#$0cnQ;Zre`AZcpTPXr>uH!%=fAGH;eY)LW7YJWyy_V%Y5jBt
zSe5?#nN;Ch#Yv->^x`m=x(^dAuZ5*)2)oapcM)heJivTD=};T3w(-T?^S`pTT#<?K
z&!3}<mTAv_J)KclY5wa`Dz_Pw=fCkk`-RUT>|PReR#a9J3InXBR}@ox6;qgG7XNw0
zATK|GO`I`HZx;yADHsg#X9=#Rt*UusF8*hwRJ{!Y>!~CXDGGeMx?Yz?F~$qHT+UT_
zSwUwV*1wBy)!)^M=44JSP*pMJt~ad?{`*NkUl*7-R{nQ!H~z1m{{Kh6;9G*Y8&Su;
z$1Js|oyftiGtp+Wi2wQYCA7?`s{D_wsDwAs?uuMJpysabN0oQd-Z1oc9HvqQ`3cl7
z{k_!hIk;3KJOr`SitTp<gfF@;N!0ioy|g@drOfo&lB$RMqbzDU|HQE*)4-{H^RbaG
z`FCfyx&QZp|7tF}lx^L*kix?4G~SeD%a$1$8h+d_GJj%(%;c^!jQ2cW$0p~_S{u$@
z`RQpqBMVFX;rA64R3aiGN8T5TyL)(O7dmvhFW;k76jQh8v6Xj=c!{*1;!;0FK^MB;
zucWgpiB|jM$&-c;T{d}6equ?_$Vk!2P(RX`#i)Kkh<fwp%{baTe0*Jf4Vi@}yBA;6
zNc1+!w@%p;FTyHjMiJh$sr36x1)if=v}MaIA6uQV*gg4$DVVORI^#C$?{kvwGolwb
z^8LF?qQqe_^NK|i6BF)tZZqMtZP*=dhG%an@sJTQc@QgV9wcCsn4dD{@Z)8*YETG+
z%!~b&cG|S7i)$7B73cK|r-hVm8s<?gU%Tb;=(f8B1xn4gOkUqTw*T^u(>h-rl*Uf$
zba4+!_muqIc*odGlTo=$nsMH9KZd(&6QoZG9k_oYOh`ZH>fxtxMM)2xdiUX}t{%8+
zJMsOU{mGe$VY^-)Uf#~>Z>76$9=O}_<x4Q*1{F>rp)WB~XU0CC$I@iaUN_3;lRovu
zH*I%B&!i34pPfm)floV?kd-CyzQkjleOvKlmze9fHf2LY)+e!o3zsYjFn%fH&_N*!
zY5n}7`G*Im98R7rzinRW@af3~;oh*dlC+uou2hWmHRQPsTkLS3U|6`AZs(aZQ~ddj
zS=tNV-M6p({`Fm+U8@AsMgzLafEBgXAuOe1wLE_wAe!gyGVR?F>-}<bMT?7zU)?)?
z(D>1r)jO@vWo<qFHcRsC<9F}YnVXv%=2=i*zI@qH*1kG;^^w=9)GA?X{qT9rdY&a_
z8QffJaPvThbnnQgrxREOahn%xTkPA|@N|khcTxu57x}cGoBJ7K^z7NQj;<~$78Vw_
zGiNON>N6sre%kIb&|))Z{P!SObA5R!KUX!>S&6qjpRy}JF?(k2T6tVtoWOws0mWUr
z<mE#L8cyQ*hd#gFu;thr;itnr&o6a9OWdrwXU{^)<?Sb_tgWrH_U9&jx$OAqNecei
zezL2|#-^dUd3TDU-^i9rf6K}p-Rlf@ZJUG&i;B$O-8=rd?a|qLC-HdiADrqBsr>Y@
z+-C{J>(1e))>4BV9bVWOY@Fh$L#=lXbI~ZqjXt@sF>`7?N1C2cR8ms-*qC!wO^wC#
zi;`}WBfJ;49>1t5zw}Sujw<+gw7Gch!Py@oeETjZ^cUW}D`{b2(MUfozI5qQibVVi
zkIz4wztyf<waVPq_TtxXPd_=%jJ}jlo^h?HsJJ7%qOqZ&vMOj*XHU;H7Z=K_=9h=2
z8ycp&3{0ezuO#nRZw_qkZAkw<y7Jfit!A=aof;Yz78M;G-S4KPwCLIM=S|PkSKGe1
z5u_Hww}5h})HCe%ZLw#`@*G&JBk%6rJkYMox5~_1n@8UA3(w}wmU$bFNFGYo?3Vl;
zDD`{sinh(S!eq~Wzq&v7P-{(UT3ViEE&Ud|mLaT^l1ThQ3f|4k(lSjtDIp=@@cU=a
z=tK_{KmPt>)ykE6z3!pe*@8Lx*RM@I*mp(RZDNRvS=i7EF=lmeB%X!VZ)H$Wt1C-m
zLyX!NpHmhVv=S0b$>Xl+&S7pIa(k~F`x73f0f~txllI@r(hpoKY0D(-Iv7gNE?w5#
z%;L@2@vU^OrZwLt&tuvJX^4(pTJMVePc@vAg^{fbzx+&YKX=^B%nSJ{`-O~C4_&PK
z;keY-I1<`<7Rq;Lk1tuWgiYR)O<Y|3_Pu)s{b$oNM=PCF{JonyH{cn<nt#^?)os~L
zM>33y99tfo+K6RkWMH6V=md3in9WRdM`Ri1ao;=fAvl_MPmiksWoNcdhMB#61m46(
z%5$hg!OhLB?A0p<N=IK`kgxBe%Ti7%r(WQ@Cr5ks;z;C-+qXYD%d&X!Vli=X?YSlM
zr6OA3uC>yFoqn{dZ=dW|;o;%wKw{Dox*(vR!-S;TL?06!?T!7U?6k7tiH?cUr_Mxn
z$}ahVQzU-k{q|T9leKq`HP;}VPkwzX>Fev8RT6*w`o=HKxe-{k^|Q0HViFP+2y`tx
z@s6+V@d*im8~5L$Ow6s<#xeZdn7xX2wNU!^m#Tj@f=Az$CABT)=_dylmlqMjfk8p#
zkE1r^vz?tonYpC>(B(K~=hL`N2te;s{cp^_zP&4O@SZT$!iAk*zFhqDO~kZ>ZM5fk
zWn;E3IXP3~{gxvi>s~d_jYW9$)$=+JzF$Mp%F^PB71F0&#iy||V!iB0_io<#q;+}6
z>eTN8t!Zzz**9E!Zga)-j+NT`A}4;$UL+Gnb{UtM&r;W}T}zO073q0#W!gyUo?YvR
zQSYihhw*jhr)&mE<b;>AHQt==F8_j=w72le;p{Ts?>aI3`C;t^@tKNxK`VJnT<HJ2
zKeteamBru9;XbN$wz>H8=OY7e?=IVB-=?IavyzEcIqswa!|kID*F@>(*G!4p=YU@p
zQgWKj%Otny@0*^)Z3;nNWI;9FZDGNEsKh;}K4VWBa_!`m3-ch%IpM&b(Kulv?+U+V
z-Pq5T+vW>n)p<+iX3zTi`I#dXsj8}$q1ukfZ2a@cjD2EvX|>};OcZ-xTrsz?S&}F`
zdJso~vNPZM`KuiLjl#x-Vskpba|I(~)C*+63@rh(<Hwg%#5QmCNlIGZ)!nV%eL+aS
zB5d8Z5nj&u7hX+=G@u?Q9F(b<Z0)%)k&bsyTvg&`8@+D4n|F5h(OI7P%92W@aMSwJ
zcz<(kXJvr+nXx03ov6Ix=VsjzLOZkdx@IOvKR0A*mMos%R_+hi47hBEI?73XK_*_Z
zaN)wSFHOr(7RwQ(^obh&T)X_uUR_oA_vP^uC^DumGQKr+fMF5c8i_@_{ORiMo6Q##
zz1vw$8-!$K+3V`+io*H%`8UYQntavz`|Tm*l8druw?1(0i|(J8PWp@EmA+-`2TGqF
z;c<6&cVz!DJX{{MiocQmFPx{2z;2>)Gi^GsrTOH+g9nS<Cw2bgg(bFatHtqlVlO>6
zC#RvI@t?oGc=_6ncvH{v=a;tShkYnKxlqPsz$-JemnC4spIA%v-er2Fbc^FhRRETi
z2yNf{_qRQX7hORqL@f#Dkf-0XXHO`bl!U3z-~Bv+{nRfwqV4PNZ&4S`iwes+JJG#@
z;uRFM6nVk#b=vN81p$AX7d3nk0nU7`NpI?9SM`J#*R5Mdxm{4;%SkiI96le0wDbT9
z-<En)Hg4QV*?Hi=+WLkDf)_>URpytHI^@9fCo9;tdlfnLT1zprvXUHK_TdBjt^oS#
z{^s1#|A-^csXqSn^z^Kf66d~^NDbPho*rbs`8AW2w6wHJ{N_W6`$OiN`r^Pqnu6o+
z493T2e@>epKOR<Aw)1m;|1Lz*kM?u5EqRusn*`?Lv~MdCW=2Lv`Qyi`<Kx-7nMk-x
zJ$^oW5a;8w@KCW^l>E}5Kc6gXrxsQV3FNYz$6AWR&#6;8JZD$|7q!A_tEz6BOU=LL
z;S@Z?iz_mgFD^?Jx_(<vj;lp=aEf`Qf6g!vZi@Tl$Ru<0{DL_rBRtm2o+*3$m<A~{
z>%d*1sUP1W6#ZxyP;j~!+1bNxnLO+{rZt~b>W|}WQLgG{?nAoOz1Q8-)6??k><<<{
zZtf@cK+K663^FzQGqnU#v;?Zd6@IOH>;~+LlF|c3QoGIR5;Zgj1kf`^oi8k8Vov+@
zD5@Lt(1KDQ>ai1$y}az8SGq4vUbnmCwGo}Hub)=@t4%3biC5xTqqdYWFrtWm9`V;%
zkmN{FO^aYYd*s(j1ci`JJX(We)Av`u9(^SXd4%*YX{SCmx>qs3k~<JLqBE4eElRfF
z*8?b}8*mAUi4CQ7dvlV-0D$y)_)sR8c~g$Tc5d$a8!f&xO0{_r%)egVgB^us5m$ah
zpibiWt&wvP{e68YmbKvohFeLr##{Z(7VO9tLc+pyK8u$tqR1ntSCBeK8JZZjM1||X
zRvF&2Thms2+Gp>jt>)9;zuvxkw>pw**NEhf`91BUzk5yKd4l*tyg+5~=~1l=_0<3`
zWmQ#+_U+r3u+4rwV7L#G+*P2+1o?Avq+()IdR$%pZW|Z2?M0I8GFR6h#<d?ld^q`~
zQLw4`(4j+a-v<(<=l&kE)eQJSy5-AJf$tjE<{o-Pi~QEq+}GD9U|e|n-+ypC(6;ps
zujlNfO~R#1X~&vx2^ihs0cvQ-B(MXBK49>}!|N~PT?g5a3zFr7=A&7aA4zMiyyuQX
zkIwilU$@n)yCy8>!w?fKHEISRX)u7pLW+*AZs*63zBryAO>O^v5Gg!gXlUqd_esb1
z#iv*C@4Kuueoja`brC-F3V!}f;a>W6TZPZn|IITmbkJ2ELY2U>U6^^I_%{)>u&hqS
z%ieh&X#48w0`xx@L_6otMOKH_Lk3Bd0lT*1HMD-*PfsGR<7vy=+vh?m=Ql>XJjH<P
zOJ=fXMNQbc&hBoX5|8PM^6~`~hq+k~yZ42R>$aUxdY!8L`l0K$nP=+r?<}oxQ?U{C
zyW{KEFfT7J;o9r_bHmZKaY;(9E}a=!_79eZTenwKSw$t)^XF7icyq2PM^sdlZNU6u
zq<ZhZsTh0V!s-+4C0F_<zP`I}e)MRz%U=#k$gT0f#oqq@5M=H&tGeik!H3_p3LVTT
zWtX<uGpt<cZSv411vq(fZERqms>XkPe5qrRBBD0z?s{|cpy2~&L4zBH3s6TWJN5LK
zNO}c|3S|}N{*~E-lySsQnA6E7CbejnJfKrv+|{k~n~O-zRm8LA<%t7Llm4;`eF9={
zjn}g_Erj9F_wW9kOfwH_l$3au<3VH-*L%{LdiAO?TGUS3249M+;B-9)p5yo0jOn8u
zp2<DgT?10!q~~EYJoUCOW4MR5cP9O5k)7j^=jQ`1P;P&cwNiF|<35H(Y_ezcZvLaP
ze#~ajzizPUyZhf&6^+hRY((KN{U?@c+3>d?+rq8sm-RGW^!m>g?se_+M?KYhgXW&m
zGvi<UzBFAYr{PG;Z7#Go@@HmIR)%|%&vlU^DnB>lgZ=K=V~F>w-$&N`M#qmBJ%Ly)
zf!N=?OT7{30Cp+IMb`~(R`@PmWoBkZv9Yny%hsXBA@?;F3DC(_0u)P<^9VjSJGlgi
zWtWP|V$%PkZ$9@m`8>i{i6>^M*nI&yS)+2(;p7x8PPxJ5A-`aQ?Gs(D9Jk@F1mKC9
z&^10D)89K$m|E@~U)NURap_YrvVV$gQw}S77rf^3J$s(!uDpnsEC|rx+o{P{SurFV
zsCtGMJkqt;Y)QVt`{y<Iep71c*MLmh^1)wxYkP#Kw@=T^?4BI$4GalcNCDe(6>kgt
ze8XN1XMtKtsea>oO*gLJX}sRTc7NE|__%hCo*)plko@Ssz^(q!C8_zpMH&3%tO5i7
zXN>FPW<@>pmxJx4$;s`1YacBQ@FxyJ&F9x?`D~ZA99>CLPu!-1sW=ws73$)>=1;7i
z7~<_Mu=|Z0H}p)CfpJP_r#2F%q%MZP8wrnwj_%q0$oU^npB`(P-19O?)(3yS^aEL>
z;PfSkTP~(_G+p1{f`$2a2)agFJwxJYGe{Ee%*@O${VRFZ$q$;wh0Z4=S|&W~_wV1g
zpBWSIdUp7<wzjqrtu|U$uf#;7e}A!z(?6%iE3u=UgM;C*u}s^yZ@-HCku&Vn`=b2y
zYw>o)zqxS`7_2E(c^UHe1{5iD*`MFsTsJnpxm4fXeaA;fDeV81T!r~#q<SBDo5~9@
zo`PK18h+{0C8ydQ^XKM?%FXrv!llW3Zjw*6l8`%q5ZJx*RaI4Xy)@olSaZ*lJLi8*
zh_z8jX_Ijw14)*saKm5kw=0Uo_qzUr7{pI}ZOvbT3aPwv=T6VwzswYsI^NU1y+j$D
z)2Pg{`NX{13{_&sjvW9><1;V*mGTpi%Fyj};)K%B(kdDo8++Q!KSwH+DqgFkqy#N1
z4Gj(6hk`o88FTpN(WTyAAuOUjdsOFtw%!A;55x^J-t%8H85POv$Qz?FOa4BHgY9_W
z{BLnj83Q&X$)5EG)~l$kT{1X0NV8(a5(*(?&?y$49<}~YJbCuVU<&@x5@13PTNA^t
z8JUzsPk?U~jyfo%b<eKIY%{GSJK86dsNtnyaQm#C5R<w9lX{*d7CnlYkVi*eb0=K9
zXaQo9;sw&kdd-iKkqWR@B|F&hi`ol7F8|3DgTayi$Q7|05p|uNovhN%0%+k08+-iy
zix>2wrX@374kX;EarQ}0NFA+ZjA9Z+PDqW-ma1vCrI`n9ju{4A>e+g!S}BS};k7k2
z6|Y`t{cb?0wUu;s4A#~L{$lo3x7a>kk*kwI3o6NxZI7mA?fHdEv>u%q7p0B6R2#uz
z`K2-Y$cG1#IeOXNIHWyi27#SO(M782d}m);4bUJz`4jY4*l#WqSa$8U6QM|%oV>ie
z^5>Z5_d!kjI)kJ`$4=6#G+f(T`Qf4K0ty(F?H*HGroMj-S+ZiiSx4FV$+csTP`J5q
zFb+S7J@4bQ>sBxOb_a^?H4O!{sJBmikS;7NG()Rf==e!JY4!@*NJ{`X()(hO{Ry4}
zuA3awoljlpVfKuIz!bA@+kt)C-*np@ye#4kL`pgOnvTK2q+EseQ4q^OL<=ZDH&v(?
zG2N#(8r;|!!n{e$wo!nx^NRGTA+(wnO*#68_fLG_pJPYEXn5cN73KYh55ry6%Vxm%
zCP+D1&(3g?Jxb^HGofBED2x^#D3_M<w&1AGwY`_Rz{k3ceP#sgQjZr21HDmMRmCH}
z*_!KV<%!LR3-ZmH2#(}WZeT(396A-jq<grmxX!H{%qc7TGk=zzgt^B>J(tys3iHu^
zN8xxSpPOBS1>N0Q;isP;BNvNsJ$LS$u+g25Aq#W5|0}ApEA8nNWhbHz?JvqkC6-bD
z=QWb}1kOEb*LN5$a@es~eI=N4-TFqRZrg`WHVVM@TM(N0HZT37q8O!}KSv%%d@e^9
zTZ^-$v{8wdhlfG|lY58l+}!x|a6~p>PF(J>rW`*|iykMpdi)%4of;dXo&8X{B71aH
zy>IRCl>6J>+2Ue3)822Ng$S!?InX^JGj`I{(IbhEx6{+zU5>wAPG`ykgx8cP-krP~
znjmxVb-+f~!%I((E(Xr8K<3Uv&*i>M&>zG^1z`G`HET#S2IzDijTjd)vgXr;PTWw)
zZVN=eR67-sn3&hXSg<ztW#;f&54PNwwRoDfxv$B8@Hc!BiyKQovxF55aPjcZAMKT8
zWfb7L-HM7-i|JUskR%9!m4(jC)8_eRkPQ@7b#?Esu&^&~;Be!x1bW%gIYvbxA<M8X
zV(t?TU}i{PX!}V=AXZhwJ?3j+Qj&&_z3R$Ux{*;S6E{jXWE`vNA3d6U_7}hTo4%c3
zo!&l6Po6Q;V^DJCq6OEi{dgL7Z4?~d6ss;`!oyw9Yi(U~I%)X2Oa2JjoekI)LXLrN
zQdCrASiPD6{;Mb<Up(KrYu_@Kq9i^_ZJV3?*$hGvRAHmdxtBj=GX~opU3WpO);Gw1
z3o}xPjY{b_H9_->>z&$uC^@2gd~(a7H1Z$(O>_GFvGiO~pCk?c!@b`+j_Q2Bbdg$F
zt)YU|<oZgq6z2j018K$3gi<=beT!&p)YC|oV>d7`Xt{HEnUKMa1k=_=1E7wUckc*y
zWppu{V}~2V`t|FRA6c%a%s?xkKnV(Qe@ak`QtW=sn6?gunPb25t49<NkM+x>EBy(a
zG-6w~s-S{_aSj)@M6RB_<1D4BrltU)gu*Eyu@dCYcJ~P}uV?;U-{KpVbI7sbnD2tZ
zKtMDb-jg6X5M{_ED28Q?jZEmf+=e<95X^n-*s(1~>v`JG@y+Kk!N<B>LZa1|@J?MH
z>tb?psTC%K{IU`fSP9kgH1CHgNU1fTR2NeU>{?d}3JM}!u)n!w5&{mDTTG1M)~#Eo
zr(DtSub@QMSrO#|Io1bQzvBJ-9dn+n^KZ1YPM7No`Z-gX-ANaOjaIr2zOQR20?ng@
ziXz`Xeg9Z9Jz`InA7T3YkB3ru`<Z!VI<TOEMRH?AU|<=zsiA?^%@nUISJ-#R$~L{d
z@SD0(HTm0nfa3c@9gHbAiYPq?`47C{uNZaenEwCWgLwpN@Rhw!r5U4j;}Y7HqU8Sm
z7#IcBOdOS|zRYAoL=zPZ2*JU@fkJs-dX57tTZMK_8`RU-;0GD`{NHHk@P?SBuVR+A
zu&wuDQTtXEKB~{OS#qbQ>&D+rM|uOnAT4<d+uGWI4jwxeBZNukPEO-f(b9QIdF?27
zAA1rX(=*8K+6pNK`6D2A8gotMn}?BLH%LhQC;4{g&YcQ0ZXn-0rnmm0$^XUOX(lZr
z)i*FKjG8hrHD!VDAfhIUN}6;p&OZ5z%h$JlAhUbV9hl!Ov=4)fP5CyOAW#_~^;LlL
zKH2|fld#DH$ZI#eZr<EPf$m3(*vc|_xK$;Tb+}3KH|wn3Ml^kD_x%bKbR%{>`_ey+
z7zgSI&CQkD+1Z7nqICi+L#SE=6tkE!e_L#PJWKJTM_e@2b83IK@*Q46SU6bq8pCRq
z9~Y|yxHnyUbL;ds$rJBckquwu3X?;GWAw7^r9Z8Fyl;BC^DnCB+LS}?aoS&~K{dp~
zfGarv@^A3#tgE9V-POIh3Ut!SA4YyzDD;-7o2RAsGb{oJp!?5m&fS13=fsBxx9{Jd
zQ`epc&sm()Hx%_QZ9NWJ`FidJzxl7ziv5pV9o#QvQ_*4nl>7(XUc2Tm)cx<R6j8l@
z0WozHa1!6Ds_@|^!9VCMvWjqCzy3t?{wf9BiHds0B!X4-=G_@lR*#aGkAHTfYOApc
zWbeH$XmxAJw7=u8pyUynx0tAo;M<(Md?xUUP_MsqVO~S{4`)Zl>ECasG^VZ7Vift1
z)^_n<t|4|K&|`)&BMo&iLx!FJ<5LsrQtA2E4;0vs1FFEsIa{_Yy|#E2YH^DFVah0T
zb>a1^i|1osNhvvY{V08(b~gH0_FC&tf8I^$zh9yMZ`>*WdLX6$nVsdo`-NZbi$Brg
zocQlM_(XJS{roO5GG?oR`8cZ0>V2a_R9aVvDKPBpsB*Qy-vtb8L_f}8CBeg16g1Yd
zjgT5+o0jbw)TNe4Ec<KGiAt!qZ0D4fHX2|(klXQc-8{d}ee+*G!}!0q@%Vx;zhDs%
zNt>aw)Q`LO>CH_3T`D8%c1ERi>aFEz@JUiLx3Uswoa#?(7Fhoz)cbVUB3aqbi3#Uk
zkE?n7q<s=P=DzQm>>^nY*423i{cHRB>V|s8HwrL)D^ZmA${Dlf>g|ioOCA@i)Y%{6
z+K^^;2;#-U1g6<L{j1_fXur`8eTzK*jKW;=MZV@x22YpC0#EkMk$vTFTLwZr0}f>r
ztDITzNM%Yw)^Hum<emOeKI%T&@!1lQSMhabCM||vYYjEjh1x!i9J<3P7P<7c#MY_Q
z_LPZO=X91=rd%Ft%ud{xQ*P$}w0AM>&@MWOHD<<!n)sc!{d5A$dUUr?cD`Cj7n0@b
z;Wg*6evf*9M4Fl0ilt|$#xzO?#N(na(`H;eXUalzM&fhg$C(|Qut|%!?ArtWVdPpQ
z^8D{U{O?UK|7)`1|Lrd<Ro%O{`hL4-Na@_{!2y8-f&gInUm28#s_@h%DB|+cp4~Oe
zm-C{Fe!{^bdZ-GG)Nn`n!n`zw)v7=VZZi|&vFaHu+JUq@CLZh!EHV@i4;2*^Vi>TJ
zc*E{wapXuKK#m2vYaq7RmXierpsK_?rkqiwtfg*g3EVz2Zo7z@x)a?1qab@<G-PDs
zhc3(EB7m6!ovgpH2#XAJ)V{aql_2&|1KilP-0n>4rS<lLsCJAmfqAWri!j~t<>Kg-
zC}sDKzZ0{pp`#>zqxPH`-VKF1aE&%1koL0dS>_cSa!Lmd9EcOoo}+&h9~tR|uB{8K
z)6m!$$g<tGwnB7tbUZ&jWwvi;#m8gaYR5|HfKWL!I=aizQGzIiP{u*aANUxX9u~e{
zHn_Vsq8kDZpqLjY&F#Fr^8n3n?;T%u_Uzfb4Yq#XW72#AMoDqLg1Xlz6j)w(9anyB
zd~oU*I;thd-xe^+1i!fmDF8a}d3a9DfvUtLC98dwtRVQ4h^dw}p$Z_^tHK2z3=KKh
zwU@9%JXW-|wY@jo?m4~JSMRMtww?v3W16K)>qa_jAjiv|87rF@?pD^w^niHY304={
z5*pAz_T3xZ&Ys=H;2V+TG|(cxJmWQecz8JYpCy;CT<QG$dE&JbwL;m=gZH<@YD~M2
zKHrugW=?f+n>{^+FqA|q04%eZ8tWr;EMfDbbb~iaORptw2%?sg#@m+pX+VUG;2gc+
z1$E(r_vjAZvr|x4r(L8_24<%N9Q}fYiwUI9;SOb*oSI5CE%p4|lDC0!71W~GmMv9f
z=NIN#KVRF@+G;*I@~O+8sWn|KCK5)B?o+I?XSP6c<QYG`NkoJmD^!E-$^66#`dzzr
zDed09OH*?NRIZ(1@1Roo!m&g;bBo&5CJJZRyF&$nvb56{$BCE(EBGufZ)jNc=FOYd
zwrAUpztu~3?pn9mnhwhPGI%ERt{dzf*nZ$Bj6lQ(#;2JSij7GD>5c_fz(eJo{c!?q
zSs=T%GnBoms|r3Hot@_iyPA~rO9qFV%NI>u`G*fC1A(cuw6qmyp<k7rI|p+Q9p2V$
z{L5;%Jbcl;R>D+*K5(VXHI<*U)8C51Gt$!uok#eE)YMdp>h$Tg6v!H}U+-=d7ia8!
znY@GoXs;A%nEC*=knnTY1}rQs&5s>hM#*UvhL=Stid&hOo&f80KKgcDg2Y2i9n8#&
zMMXsi07~)xE%)s?z#;hn?gz7L>w%u$t*S~h^YbSF<N^*3j-~`ivMz(~1^W~mL1t-P
z(^x|(!%u-jUk)Lv?D_NM=u#8pJRTG-xLXEFv-8`x<ZeUoBc$_R&cCu5$`b?hK%yCd
za-n#^A7Eaet|lfXR)({3vgg^FiOI?GH*X9^ViwcZ_p>h3UMY@-g|ZXv53vV@?fEIv
z*Hs<D$iWc-cTM^G_iT7>qZ$M!bQ2}@Oj%GPgjI3=(zvv|ygbXeNP_tHKKGAiFQORS
zG7bXiXb!&L!O@XmS|SbsT#xlPuQ>Ikkx?c!3m<Q(ynLy!ef`awHxc<X(5`T{OdS2P
z5II(fr-n?uIn5yL<;d@u#2$D~bOx|RWM*WTe}0t$@haVa)zSEg2{+K8z0$LwSW>a$
z7nh|UCab+>&C+e#wh_)~>(;GVS}9Z%&J7zD()uZ4N0zNvQHu`DocM~YZEONSb3pR7
zK(rZ_dOiwVT0P#MTWawvLHu)l#xe;Bi94YekZneOs)U7xMqa*5N7)Xm4=sxb-TwXi
zDPHg(;cQfeX09e-U~V4OolsOHg%6+6snj)vS0b5KpON;l`-?;=Cq8&0q0=s3v0?=s
zs~80+H`<*mhx1YRa6_7s>hbJ~UPnjACnJkgr9j1Upj8cn?<W=8PIxMudY{vyJ!-QH
zIe#@rIPd-%d_J5sXj>OhVg<CR2+08l*=uAs<OP1@1xee+$0wgPHLaGPok;Uv6_g9Q
zEigE^0vSgeWkA?cO-=1c!?oq)Sb}H`iPhT1x^`{X!*3;(1xFiRK>^L{4!njO(NcVR
zb1;)&^zi|-MxWbD<xz+DfhdUY2<&DzI?jplX4Cu~9Pk^(=UP5?Z_9(?kLQC$0Gx)N
zC0wD=$&9NxIFkG6hC)N>AU7P_9_4ZpRO#Eh$GBqZ7}(h8G&MCp_w_BHB<(ouovIuX
zd|As~IfS_y2gPxq+X;QEu(Fw;+mAtZh&B1_+qQ1?hr9w-1sxAul0uWc99PXAh37Hw
zGV-d&GRSxv;QW<<t!WEn+%Qzjajt`WGFV&8tgL*|iCIFjA(l%L`}&%cY;3o;Vv3fe
zq5H_krSIRrr%;rGnaXhlmG%kTB1C^~>8v8?>-;c3KR>mR(?VOeFu~<l2_l&sU65x?
zqKDWhdamKe?d*6?AD7#2Y|M73)N_*y-D{*q8i-58FsN4xyHY3;aaluyZk1Kn=#L+}
zAqvgTPTRYkK5afR)R~iq4O@s`VGzIKLZaPP(1Bz=*Kdd5k7L+ry}2{ud0`9{iri%s
zuUhU3)KLA+tJ0sew{s9)tE1cpKAixNI@fEE>e4T=^B&%GpkSw#76U>c<ICtmt3%0G
zuX2tJwCYwlU+nGa@kiE6Lty}4y(m)Z>1p^pQ2%rIK4oWT6FoUl?d~ola?5<jASpDJ
zw@gdJfk(<-zFgtr;sR=XH@K10@!oPz;O~Tb@%&!#0Juf;-(~*G*Et|_Mb#e-AfhY2
z-|q2)9qI1m<j7%-i(8gXO-&izv04fQK@5O-mcczleM5nMedq8Vz?6z7PiP6}3qFP(
zZV)8AG`xdFEt9gHoiGXAg-htKB1nlPxW2v~)NJ^sgZETRN?XoKjW?b_xIj*dAZ#bB
znH4x8%Bt7cySlnwq492c`<7ckfChxt_OsuGXJ%$PKvTnSqJCz#=u`nAq_=KmcItUX
zh#`M^c3NFsU6@3=u~@9zPO#xH@VjWd9%y^C4peiw*Mfx|DE5#|%|Q4<K%-_BF%HC`
zJuE1qst*a{-X|LnGUvhc<+O0>A3b`MRnC1o60Vt@9UX*PC1vGK5DWlARd`DCBS!>W
zG{TDD^ufOPK~heG*N_xxV8oD*AFbd}-7VQ|1NwFXjuqZVTMDn5bE$J96Ny_rW+1j(
z9L?I%-k16h-8C+6XRN8INq~v8MO^s;?0B$0e=IZ+r)VVYFan6#ASJbikB_e?+_c2q
zXvFmao@!{UPuKZd8#^R~Dg>0itVi(bjsAc#b)pMCfR8iCJro9mxV}9PEJMU7g?+u!
zGOXkZOW?E1Qme3Mz-=Ux+=g7joI1;Y-=q_=;p>8<y5wC)I2-|Shd;Qs<518ZX}-m-
z60tsv*o5!7@=dYf1J@#ehrfS4*m(0G3zTPz-77yJnr?%`CqAF4={4)tSylxy!uA#M
zxKDLI&H^n;o&Nau>j&?j2;clQofDr5|62`66?ySuu*Bgf#18}O;EgYrp|?G@l3-o4
zCMY3xWEClaE(3<dBWw8Rj3lvXfPvt~35Rh5GN9<28%93Jup#WaTL=JzKP@sY&L1xV
zI-i@1YXLYK92m$0=E*nb)KpcqzSa|yP?2#sSWH${)}ejV=ir$mW+0-16*+$5ITg@v
z2}*zc4mU5LZsN?aYrQk`VbV({y9%a*+i+P|)z&(xG%n%Zupys)Gt$$A3;OpyjHM6Y
zyu)1l(01FEAM4^&KcHUZ0mrjJi6$P|;cGFC+ZGX!h|CC0I`TL<jDg|d{#UMCp{Xl)
zicCaKUsZLrf=GNN3IyDZ7PTu)4kaB!Ud%ktqeLBZ|NamD_HUn`*cf~dj1DC3-iwmU
z=;`TOT_yz%6>o=KNl-`Ps~DUYdbx+#7c5wiV|^KDcDvgUCqy5AFE2{ohTwfET4u;x
zgud0$(cz?_0trTqH36BOf~byeyR)6&n=|Xpy@t6M@S3J2#Vz6mahx=t-xqBr)-uFb
zoutT5*6&ZAoB(E$TJP%MKr9iFa94a{6MQ*%1yb$zZ|xCyOZYjsYAr$<QT1rBAOZRL
zPWSR;TnDd$<c$`6vmnuf1D{*eh3;ld!#5S8%FCal$85vB1qwYUyrBW_f+Un5_7$fY
zIW79b;nM(Yq&{fp9+Dk=d?f@P^~2CBFpq$YPx0d0dohr4&!(AkhTNzej>m`M5)uR^
zw^@SwtArC*Z?ge09m;|{T~ZDoaW}FWqf|yA6%`c>9aN|m=#9#1Yj;cbz+G}h#`O+^
z-?Ky-nCGi;W{avu7ZUR)tQ*J$0q}9!tcwT>t40IB^JtSjnjvtD3xBygh<@|(9b{$!
zopjakhbd}x05!uPZ^yp1%fB;pa1a|97<hHVXftDUP%pgj&raQN=&T@UE^@$$fkI5m
zFzgyHo?+xTC=eAF$5)s5NNW7rbOK<rX9mw(dn}FAdg3^Q2Vo(e`}yK}MS@z>RKgtD
zLhw)sR-Wm^ZWuFJw;W-B7!ARd_(EUM%tdf`PH@$BlUxU7()Ia<uiXO!p|4)OVr%}+
zvE6~Qc~escH~L=W9uV84<gi;(t|A{4dQ9I7>?^25%?%c}thowDT&r~xz!+h!xp^Db
zL;I_O<wDORV)kIig|2`yoJGLeP8fHg(=jYs01mST$p<#no6@4(+*C1qd!wDXY+($>
zLeT*6#T>VT1^~%yIsCBPfHTA05%N85ENdt*C;P%@4A!p*#+h#6CN{!H|M=Fv-@^mr
z2X+sOabUX8ERm-AT6fS1WTDe?AUR*KK85^<nF&@e<b899WEQoIO*nkwjvpVh)nep<
zfr%--Q|YbckHHT_wIi_$*MT|cdDzL6A>WazSNFr{CoB3Y_a7fzk+aug?1RhY@(z;T
z$)E%%-BhSZL&L-8IcbQ`fOOi#z1#ohrp0ci3y&VjB4ZF!0j5mM@Wrfy_eGF-iIu&H
zj#@AzA|e1HX}Bf&2vtqZW(V%C`A9ZfK>R{3wa5*e!<5Z(kh6Y(Uu58+&~2C-j>^|1
z9%pYHyidP+qXCml>KoWK6>nWq0cgo<-Qoq|xA%eU1AO}Ra_(83O6hwZxO@$Uxy@0n
z=^UH1GtK$)=P44)1dCP|KY#X&r?q|e{;8WsWKdCeoE}jcU`<`MXaVr3R5%1dtB0){
zCE=kU1OPI=KaO)ZR-5D&VoJhk=tf&%vAgjAria{qPTg%09|`pn2y!@esu~F_&#G>v
zu;FbAObrSO3S`2CxV}Dq<Z)5bhQh8W*?;uo<EYQgxg2PZGTt^Fo1K~D=4VRAx}n{<
z&>g-Wo!-mw%9L2>{iy&O!ipXpctchxN1g-s?3Qxo8@@_cTNS!qb}iswB?^TWWT7Zs
z+W_=u#_?^aOB>|m4&|ROK%Qa*{>3o~50A6b48q(BJ6fRgu!1DTF{qtxh1TxpFx;i=
z+*eNx9Se5prxGGGR6s48_HAbzhkf<<yL*0sxzL{&VSz<5u#u{TporEc`aP4^qrMT5
z%;s!p>78ipi0i9;BURR+N795I*mC^smZ#ln=x<029D4Ks2Pz~jZ9_1NXgFF6B1;Qf
zarb6aeB-c$dY!*-oF%4BX6sU?Q>T>ClEPyYLJCu2VtGr8;eg?R&dRSYCdS6M&&_#~
z?uyhgDBRnRE!>Gf7eD-j3D~F-!xNMtNZXicpa%UA_0gn$j@b5Y85eQ4esy&k!Bmf(
z{f<q-!nZwto*@7b$E6Gzby9_c3^+JUjU7e7-}bHrMsT+<O5wwYmVo}SoxQ04F+{YK
zQcqdz8?S}*N!Zz_h{ZWSf0RF-LCvG}6}r|1R)ch|IM{aGI-8UdFJ8<!ag|NRg^n!P
z0QH4Sq-#t}RN#yG)aUP2R5ZE&W%^CfJ~5VgI`BRKwHe;Ey+D&)@Du{$Yd}Qz>96-O
z(Xt7plxU)8@;d-aP|-qK7?ppJAvoDHOiSq5NGHxkLj{&5_d&Mc;i5+``uqFS;9-$&
zqEgcGys?ps@%56eb}e2PHty%<<yCHa)!5R)x<MuMvCgxY9IP`5C+vvsbFB%{?B{ZJ
zZwmxT9zk;L*)I3T;7CLwplF_-T&FAjmlN2m63!Nc_N%hKeq~->9vr^g;zUiuNyfzX
z03<j4oc{4TV-E`&J7Q$HckdqQ0HGWXK~l5B=W6DyqSm*vO3!g%ShHrbZq(>ida*jB
zP(pEa>ufC0yJc*2H|A<uS~tIqpW5UpJ7libTM-CsSZ+z%xX1C0?LLu1#eql`ooMFZ
zfI%uJr;hZQ`PO^$x`TR%ry4F124-gef`ToLw@lWNegvF_le^|7WvNFm)~#7n{^W^z
zul0>P4EkPsFK+3}e-S?t$~;&V>zZp?TJw9va0%3MblxJ$)5(O?K!GlVz4RW7&yTPN
z8Q&L=LjkUZ$ch3R-7f`aUo}z%?`aRYbUh()ryh-_6H;d|7ITQv<biYD-E*;x1dc&z
zD+A<b;NV!{<m7~+OpGD|y4OO1WhLE)%n*Q;aGe7Oz5;J0v$BsB13Ksch=T-}>cucy
zwyfqHy8VfsOZKo$M_HzPDg_65J;f@$f8>t(Clr6=_6tgkFJHfAB;%2oN4O|?Ebg%G
zIw3d1K%uhPhtrKHsQT3l;f9CziW7K}nVDI@;0CMb&++{yy=~7U?Og?*^BJ#oFOT#x
zDJdz~?E5rzs_|o=UoS*aB>m68*RT0sFD}{XxF>+LXkh6qz{1_Wef#lvG3d(Wn6rp;
zR+xDNl+j5h+(4Uxut4}%UMUgKBp$<MEFwg5M6-Y$RKV~V%E+%JzpvO69btOz=W6tQ
zr0GVPC-Yz6d##NI1@+f$wJSp~yavxkh(}LPPpnoYww%Nn-TED@?9$E*GJHpi0#<Nz
zAxa1u4GUY2-06=36U4r!*UHN3D$aGw-DB}Q){SV1J>T5;K<2{$76A-Q#V>q6jb@Il
z2$~!HaaJchAV?I)nSqp>hQO#W;h2FY!I;j8I6iBe7njRd$j^A;1eQO2x~Iyw%Ml@o
zb$1;2>hyiUh+qYzB5s~l`wy*c_f&nd*WvuL#j$(hgcXo$tHTxbu7X6x2&9(&jr{xf
zn~MbdOdCKqGp=7B4o-l3#|}1t&{r4&L|e)8XwM)}8xq3`B^8y5*RQot=@QqI0PN0N
z?b||;me4uqk(kE}&Fe>JUAId~asBQta9#+%4bQWf^)puVzaoNyOVKB4d(hm_%U)>{
zu)*og4BRUxKfmGvkB*Lo*pDR=>EM7%kUVB^>yo3E3o5zM`hc;ybc6J>-vGo2NdtVF
z^y$<9vIB{vH@T*{o0Z$W<!d7-6gZSBK#JTsg_Jr}hD?_S<Cz*uYT+u~7jjJ#-*@Sl
z8c<VUrcyX_=0S_N^AqA>gYeLJ!${ZW(Du@Mz||f2n$&mes55yik1uCDK>vYinG;uh
zz{tq_aU>V~iI-Z23luSUODN379Q|cL=4JM!KmD;%dq(`91hl;@+yQr2T<;}f<iey5
zvBU!(R|RLT27u8nay$$^NWs*UBPb|{f{6uxV3N)N4$nwuF7<!wEc9y8RFhWH9Am1{
zhgQm*{ccBqE#QoIM8pd4)*$af(Z5*&z7adEjht>tRG0fj;_pK>MNq1M!O*YekoOD)
ze6~a!f%M7`Tspe%V)8BC&rq_9w#LUk{S@6!A0s3F37N;zV)rXSfXWK^!BEDj>m)T^
zgn+^zwz+XWXD~y*e57PRkMM8lu52^Ng5ofSA*jS^x9u1HP6c37d&4OI6`Y-9!gZdV
zUcVy{2&}xUjI+6BLWg+NeFFln&z=QyO=fjmU0t2OzFiGdBG6tF>JD<-GdtyPI7sFI
zHUpLZ&s09bQL=ynBEJ)WD5v)mI;mohX@h5OsRs)kxM1)R!^*>Pk(J~_&mk8IwA%@o
zny~$3DhL9nkB=Ec*KMN*8?m2#v$(iI06jZtpfmQw-(gT5qlTE^@3)M_T2OI8;<l^s
zC8X%SlowV~Qz9&t%hv-)nKp?lFJZ~S-KAoXh5Op7cK8uUAtKh8+A9elZiE2GicK@G
z;Dynkr-C-kGR#YToL$fi3{M7HK>UNH-uiBf2WH>w+vTF_gTCP;K^5`-(obHQ403Eg
z$$d>TIUKwM;uk#e5`>I7@>bDz)|dzA0aB6cz#enlVY7#xK)jJ`Y{GqtdN=}PBoH0K
zVvLRL#&7U8%XC|!l><E(+nqrC`%V<W>yRywO6kGh6Kf7YEYS}Lm<Cy;Cck@OV%XE&
zp{(bT>2ku2jEu+tyMSI61DQs;GR2wD4r{8`9jiLFS}<?Ct*aL+O;+PGH5)AQ=KQSZ
z<0`jKwF-nZ$(sPuDd>-L#?z4_h)*0xRupR_wSBu8F_d6t+H>xwYtudr?AE7!8jsyA
z^HBZNZk8Ja#Kf$cZgf`yBW8E}_yP*>An76reqj>SSp*DO4g9JM9us{77bhnLT8YK(
z*9SC{<&*&=oY<yOyYyt})tk1pyZfNmGRGShI`z^6wNWA5aH@Pfj3SM!NMnE`ec_Xe
z0(@fL-ribiD$5xgjs_q}ccMm9BwWpL+NuCXqp+`VS6YBe0l03SMi+U~(J|UxJRgOO
zc-Yr&ITHElXC&ATnDSoVGLd0yVC*Y_*_9g)2$ZA(G^ke#MVBx>bnCWiR5>R@?){}C
zk%b_08>HDrmBXB{4|E?gN(>PY3d83=gEoQeyN>DKGF}7F5;GuCU5pGv7~zYkdZa}H
z@d>z4QB$*cwU8bqa<b=8idNudEn^TF@bavd@i)lN%PYs{(uRH0&T{`P<EWWqMxasQ
zRYN98JCMY(c5N`gn_iCIQq%$O($Z3#pKgeU-%6(GL0ZA)M}OnyO~zHLd^X#>C_}D6
zKe3uTAcnmUJU9Y@7S41suF9QRzg|vGZae1lQAp#D51`{YQlB0KEE-@eQVZlum@^Q-
zTBo|Fp)X=QeHWrO&%AOmB{6>+X&FwKakD#344LE3gbLtrd?qc~8W{AJ0ap-mXm>c<
zg2+kMgZb8BkAs|Q6=Qu34<59HP-9X1Y7Jtc8jC@OZ;YppsnsD_MIh}FWQorK#~d@k
zKH%tu4;8cCw{Nc@tvRt%l6Sb&@9ddPhN?j9LR|4dg?C!IiynSYFF++?Ekw1tedmrM
zB9w@4ka5XGH3T}eeOPS=XJ=m|>Wa!r;?d$naRvJk>&%q{U55x}1URGWc7aj>79az}
z_`G5GW)OQ+!DY;U5zYz1<5kEdIV;W+76qS042`p}4E>(tK0zxE8hvaOo2*<{BCO<b
zk&~wNZBAstBhQ~}(PqMfw1fi7Y&obp$+RC32Dz_I@9@?M{@TMU3@nBq(>;FL<mLg%
z?-4tDG~%FN=(k4^jvgwE%9%6Lz(U?WKH7I}^`X2x1)qj~NO!M{s9u&<+|g#tk|S$b
zR5z?e8Qbpokp>NNU_n6v8kCZJ@Q}VhpIyLo3K)G7Ll=dKF$#q=&j2gx4-AvQGXnNm
zA}Ih@x1MPo5|1k?wuB9tNX~Z_sX%#!0vzcaMg92l#dDn%)F>9+P|ab7@hT~igAJPs
zdKe1uo_d*&Ixo(AADA@i?SB%kVd|5XW|{Qz)8bWQf?#W<1|yHz*zhZ^{gA3Bl!t2!
zFgI;9lAm&FHQ3P<d<hr<4X)LsgB_w4i;WreVPtkw5J6%VRWwa>VNlq!U0xfZ!cbI8
zBHkdz<y+@e<KOU}TmE0MrV9G6wup;&?(>bk4zsWM-0V*xbYqVB5`>0q`I@$nkPzJu
zdQrcf2vA*G@|1d^7H-(EVY`gXby`iJ>C5Hn9FkU#BZYIn5y}`7I~|`tXI&h2n*M&g
zrKN@3tAUvhCh?nvIPmiQGe0hssG_+MV%~JX7hr_>#S3T&uL+SrZ~@6H(57tG=n7qk
z7B#cBvg*KLL|3RBzRmev!O^rytpnVEU$9gX_DKsfrtrjxznG2>gr0()Z7KS#-GCi>
z@1Cy*=>f(OcMPn-U@i}o0_fz|2j#$bEP@lF3KEwV?h0r-JeLf_7Q(L3Pdt=3F1s<&
z0J-2cD(QA<=`N7(#J@;@QFzwsmAL-k`4iBJT=7qlV#(AWcx=*RA;Xhu_`vm>hzrwA
z8*}sWL|G}s$oc&I%}`gW!&4;*r%J{Tad3__<uvVndEv{8YgjUapQpj{k!uuy4DQ&i
zn8nZ|0CI|Unu>)|<Jx3hLmi<D2^hTbU%?>~v_=FT6Po4AFT8iGdMSB&#0mjti8*RG
zH#_KyILK8k__=V9C>t`1B)mCQA21w-<Q6Y#%AD}y32*}Ov2^zK#(z5T02f!Vv9l9r
zHo(Rn{GhFEFnqC;G9WX~_^0}NhVkf5m$&X2GbOp65UU(iP#FoaT439xerk!vlZR*i
zE4u@JTimWi4D7T?{bo<%Fdy(zY(3t3hZ>5;vy)4EaVWKm-A<~pj6m$G3cq?P$KYnp
zMaG&dd*zRVd0~}tv5VtoiY!VYR6w3{pAMN9PIrEw4Csl~A~SkC+JRIO@3w>M@LTuJ
zy$T0e49lU$qUX37$}bMG8CqP>+B-s=rH-JXN9v31zP9OsKgt;C9Zwy!Ks!sWdRV(J
z-2{(u+oGD5bbv6x0J|83$+`_ufU{X31U}xgVyWPdFp9+48NBdLz8G-#8-#?^B@F;R
zao@|qD(9|)CGJw#L6}z{GOr-;t_G&hJX|pYa6}~J$Vdi|EAe4Zobpcf8$e{^uN8-k
z2uqJby`R976JV8d<eh99G|wXBZ&gbxFbs`++OwR${<u!Rmjn(ufraRyhek$zi4pDm
z+4dAG_{cLp77^n-$cGn-J-f+eF+dU3#GL^Z<k+!(y9jZU#0!sIAFR|)Ra!a&wOGnP
zv$CRs=dQmj|2J3PgT-#*phuChk2K*<o@uxEHJC3~yT|{as!9nJY9U6RqV0R=tvngR
zLqmO$@yU1zaAQs|En)lNBLE-s5A}XeoJs>l>w)u8cpC#_HDt)Co12{$y}0ElK{Vii
z4$JI*@_cYX2a+q=eT{HSRKBT!*-c8Y9)ToLL_;8TkQU_9W<seD<}ER04W^?YZIGa<
za!&t>DSs-8VZjkbX3;~JyQOpn&n=>mnJ!c#EvN=<&QmL}x>w$g@1;YhKovPTDJv{2
z{H{=BoVDA&R0+1`bp(9c+1;>x+K;d&7b+1h2^&)#E|7x^(Ib6{fBW`?pIamE22=(7
z5%<t57*_@rXad9P<tGse2xDU@fXkr0)iZQPT)Xk7v!f%j+Yl%9-nA<_PBRlHn;y)e
zySG}$vPfaXPXJfL?C_Xk0*rpK6|>Uk!MO)T20JT5FMw2|w^>xwxR4?*FVDlzPbN}n
z=;>eF<JoI#{Vd@;(Q*I4i$|u#xHka9c03=lF3TMUCKCSaSph^wnwB=_C*|-DiQ)bp
z;3L82A0ayXBe%+S-8#JzzINsthaRm(+Ra%}gWGsmF)vXvJt&jceUuvf3L3H;7YiX=
z@?okq+JV-AEOI!T6gM*4$jqb@xt0ggG(lIG3lMPO(e?mON2F6^AF!?qFldAO3M%UA
zmI5}^Krj0(E=Mz8f~QKvxX=p)SL$hWAJ~kmkO&=|oOHiBtB;uirVwIS+GT)-a-_XP
z7THG}of4X~#ZZL|OWcbJx$cq+3vhK1x%>nvy%O*dRfH&*0Q*(pnVV;)rFSo~y2m}i
zs=z?2m)GfpOhx<p^=nlP4PThLHTLe!58ZQWSCDt`=2V_^fE9>&GXJgvQePXLjh~wA
zO(wz~X{<?eG2d4Q))kndKY)B%9iDQE(78ycA%MChpb*OEQTAarSzyJ?008d|Iozu6
zlOhL({RwXkyB|?pQS}nAD1_f08XqT2uGz6;A-I9!GMsv3M`T=Xv@oiLJaQXWyjw5|
z5KIy~$u2<6fIxLG>Ce4Ha|<-DjMNOqL|W(%So+X%>W_0U&R_zPYIXS4yM-xAZ4f{q
zQ*<Kn=F~#(T!+bmQ(vZF*boPufj3~0@qg2b3%LLz{7|2-LPNpjIW;Iy4!G@x2#`?f
zb7Zn~PJyDas|6SgnEvt2609klip*RWCQiIBTmTaplaQW3?XfW~1qKv>Upb!H>)qXB
zpywCeFfP)$t5ESOWtTa8@~CeoPxsnDFbWI}g*EYFi}scr@N+HsHY*6^#Mn|7NFA|l
z+jb|*dA!0nXx+B*j0i}vijd$a9pKR5ju108F#+qGbSXBL$9hM-4#2SrE>e+NC%giE
z0s$J4k-#UK{{H@m<RzHY619-7chRvok79w*k`+4GEzIxZzO$y0KJ<3w)z#D>FQ1<*
z%G_!J&4*xH^zRVWH@|Bkb4-U1`+>m?j?|DLJrJ?~f$56;<OU#ScJwImNY=5WzP)ds
zn{#K*RoNcJOdJ<miCYBSg3k}k1@Rl^8>dC&VQ~Qz;a*8O@B20b9RNYr#GZn;d;H0<
z@WBJ?OP+gU*Eu<j3a?XV_;dlu5MwFJK*iuqaK(@!z~{~$>jmI`S4*XA$3l#{vzRa}
zGA?!--pF4sLOM?nP@NF{pj?@gopN@b`Pkk7auCFX_B}hJ<>`&|kZD>9j;<m%S`Y|%
zV&MP6PB+Ot3_H$@(csj#);J$-&YuDGxRo3%XosYiUB)b2Mz~)#wopjR+2P5s1Awoq
z1wBJD5aX&IcHr1Nh!=Dmayx4CfUb!G5-p-6i#&)~r)MVzF4>oqi<Zu$LZRc&v>rtA
z1a+2R)n2?XvH6q7uIl$|+s+BVR6ux}f!6$!AXjtQHT!WsEl}tS&y45f{7}VmzrMa-
z6vlvH_6UqKK{BO-{Rc@A`nyS-cMn=zz5KNFYhdW5?_S>306UZZ<~})BzMRH;DeV|u
zh7m<?9Tg>UTl+t-Q3=BZ5GwfHU-I+pT+d)Ygn~)yxNs?fjrjCCQDR=@S?;cr^&IIU
zm!6?X%x6lWTm`ej3Wp_X6oy@n#^aV5;V=_<y&Hz&*d;|25{S6V$*z-$({&FWKhZ+$
zP*Tk>X)9$ViI(7MWYLb^-T+i2!hC`VinCJqQpr+RTw-?d(j`j}ex@6v-+%=q*OIL8
zQ#_B?s%UFlkG_H!Hz6kxM&!iueaTE9P=WP&&+t#e{Je^5b)PqQ*lvU+hR=Qc{FLLw
zv0`;4Z+czQq9ri#zZ{+oKV%E9XC}UL)6=6#oHXEgZ}D{>y?|?y05vw$iUUP8-EkK@
zh$?|k>qi)JNOQZ)%sA1TApiq#lF8*YAk3bl-yX&oDxqJAWfLsJs}hg11l&a@i#x*5
zM+LmXR*N-&^A(}+#S&)_#L-ER2yFL@OV0kd^>H1I%jZ`MkeF0Z3Xp2ak%UM_bWRu|
zp><~gjj-UFL0mD^fo7O6u;63EK~j`qEEnuO8fqd>SA}P<vTe*dkLL?!-_=dZD^|l2
z4{vnMgrAq!yv%C>CPevAZj?1t-wuK+A|g9RriZY_u#l}n6jftCw6LAv-9p%-oOkY1
z_66G%6zS_+4#3R7z(9=FiVTYg@z}hsP$${QU>=0EwJcw@Y!~jMBKy5BX-5#!-Srd2
zt#98}w6?AR#Yi|=k_zF#4Z5tuMkdcutk7hH;}C4X8iIl+t}5b^D6nr|-`3U!qbY+-
z&Jm&gw?YZt%r!N=Tfp}CeU&w?@XFH32oCI%TY;nay6~hW((y{vPNR$Jrf{9Zl&FZ_
zNlAlWjvx;{K5pP(WI}!R=7xq{=pYe-{1(z*$$5r(r4#Y9i}1^6yqgjN0+zs}vjN}E
zgEkcL;!Pp0U;=qk2@M)c5PAG-Dg2s{)XGZalhuMb-2R`9s@z8t0S?o!!ngDDD{<iE
zPtRflBqP-y%0fB9qP`eEB`hGI>Z{i!h$~f)>K9Rv;7Du7^HAYw=05=5mzonA8gztf
zg<s9VM%}8dWfOcfV`F2me}zJoY)Zh0mW{w{1Qc4rW5T|{fihPK%ZEc()lzaP1D1jm
z3)mCTO@^(5@19IS!-#Sfg$$Ka1@Ayy@u0yvaoNdA8H*(1hD2GYK!m(1a<YMQI_q9>
z3CtYW9pT9pUa;BXf)9k0*1fORPM174Ny5hpS%I02jg7c$4&OlYi+O?!)w%*waEZ)|
zqr!}4P6YxOCIE+UVMYav)WG7*u=Wz-4IK<yEe?_vP#@_X=00rMNlRRK2(DddTo@KL
zF^$1dX$G^SVUc6TM-H3VCY;$I^hY=zgE8-}E%WA_RBQw~B{oz-W8+t|Acx>pp;P|4
zDG8c6;lF`lcpmz076lgf2JdN(^xIN;PJWRBd{n*}8N6gI36lY?2~r;`oPQJwNFRbr
zI?<a@pdd=P+9>r*UxI+diGYTaf%NQf_XS{SIWbkv#l=Og$3nmey6A;Dxw_V(_Q1*E
zi)McVjNy(&Y?8;`5SQF9mqFDIk}-hth~WZJvjp*|jB6TzR0ENnQc&XI<Aa650Lq+K
ze0=lnm)ZhM;QE#lLKWTIa^6)0-@#{mx!cgfA~ij;uH-&=KH{=N^bmL0D@f!-Lqah^
z!?6p#^C~n9WZ(|Y78tbL;ye!!8W|Z7w|#P3NPkGp%nZPh1x)M)Ouecl06P|P+|b9m
zRr>957!_A7Y3Dlp!+f|;7oFgMX#*@nz&A@EeCEtO!kt<*;p`!>t-^V)2Y)SbsX-3(
zLt)4{VT_Zv3M^st<?Ja_;E|n3l|(PX-GXIsk0W>ZY4EL*xoCiUG}rG-mJ1^>izp?0
z1;m8KATh|)iO19!xZ5Md{|ga?tQ>4j#JqyiMeu`yseHa&XIGauWJ6-o=FVKLi4qQD
zgn&z^XpLg*qI4M9{&+jcr5~_!1hK1S3SM~fvQ09B6DUyyjBB{x1FYi5x0iL)*F*1t
zPkJX{3{i$nTtHrnnUztVVd%f!UW{P3^mHC_jTjt1*sYzg^dbb!tgZdgmRn+mpCVzy
zH}LMh=Eo}2b_Bqv`z=nIXj7{G&z}R)#1K_uXlRHyZ_)T&&CQJf%uNIRVzHa2{wP$w
za<IJm%055#jFKt<7Tp|G>d33B=Mxx}FJM9jx<VHiC>TR@H_4(=VgEx6p??ix*LD7j
zn+<S-!e!g7_Q=Ls>8gyRkpxmokT^_>bHVMXI1Dog6HuD+Lzk}rzX8T)gF&(HyYtjC
zCMJelPXQ~e^|}zWy$B`eo|<h)(8MQ=p%)}Ej3=d6{tv82SHN)Q>@>o}?)!(Xns*g;
zs;Kze3f<F_l9D>yw@IZ8bQpR8U1Dg36ir1TiZkT4>t`n5Dbk2;7!&-4yA{2WTrAM@
zx7Lt-N4Jm$)q<O+VLfjA0)dWy!x9VzM)$)J94S^Xkvu1MMs0%QM>A;$VWHx_HfUav
z_9LeOG^up{sj22)Zp!LvbDWOWnrr*6(1H>tQyRWjZ!6)$fZm;xi<@P<iK{O`^(^W4
z5YI-|FEJ8A1aag~gXh~JZ42W8xeYY5_a^OF1Alnk@M-JcF0!x}1I3ROW0e}O?_YuY
zVJtT6*SUmCF^T*O%FPT9LhdV}rS-;$($*92vviL|m&PHGyU@E~!xs>u2Ba%`B-m((
z<plyfnU&~ynJntE=;nr|Z8~4398|0^YeZ&X{^2Aoe0EvR78akP7wUKK->-#ZipMn2
z(ZPWX<3QVAh#P2U#u_w<y7uLi#+OGkvooWbp&asZd&p=)vMgk0aQcSzep$$8y>IOv
z0)rD5%*SQgu@525cECRM-^^mw{m6buY+w&#&%(Qo?F5e{$DGQOibjLT0kFR>hjeCW
zWOVpo?F&UJk6<hbddMNzdP#D4_DuMx348WBKpHZrOkowbSOlI{+;z|xS~rG)Hr2}b
z;#OsXoyc8OXym0tqT<fvAARxxkO|6i;xA>*5A!9cG;-)-bsmL>t=YU751@aXLTW2Q
zD{zgbGguvz$QLJ{)zqlm{ivRU0Us^kU$$D<n)zDgJc&9@S`vT>^gW&h@wg`d#<$eP
z4M!tj&m^-_Ik_0-4j}jpz~2y37_pz@K!>nsEyJhz_+V_X^r4uo3>Lrwd?-cY+hQoz
zl22D%MJd9oo$8M>99FGGd#W0J@S0~Z=xy6KF{m}fT?}6g(I5!j<&eG##$en67lIVh
zV`vrS7$Hs}?l(S-gsX4;`bsbHI5cmbV*SnG=-suN+64oMyU8x=mluDaaFcGC_^$w5
z{19!RXR4NEQvVB&agbcsP3uQR=EI;m5Woc!_j|a9YHM4d1A|*uR^=@L(UaqH5ZO)i
z(py58vgBMM<Y^VCN#bFJssPgl@mzE*ZlnTxRsk*gw^9c8Mcv)pxL@rnPJpnBFO{Dt
z=a?WYd7yldNIM+4p2PsY6Y~myozC&o%dI?0qRw#Prn<wpD@+_QNDXsfUUy_Tt^`>R
za?6jjCy95-y(yrM4&3#V{#lB$Wp&V>?=X~L<@OSf*Pu7H!2b&&o!lV{%rN|=X(I`K
zAO(G(sXR!DkPd8)@_B%~xpCkwEhMy6;#b_-L?stS_SY*v88|lMnU)N#?kZfeFceZp
z7A<tw(TPrgmjNzK;w?mHOpq=(5{QJ=xH>Nnw-;P_yEhW7?u5g2=PAc;@|-JwHpA^l
zu9YP0CG6p_iV&F=V+2C~r?E2u%X#hE{$0vY+fWL7r;vn1wXqQ?DJ7&RnG$6tW0@&K
z10j(zQ$j_NA*2i?nJJQa+$y0+hETn~RrWI+$M?P8*YO<t*@N!k`d`<&);iaDp6kud
zz`{G6Yatu~h`>)#7aAK_xRfdsC1W}Md}4RHWtb9Q*Vl@h0OLSd(c-Iqz)@F(3q&On
zivUs_Zsrd2Y_f4)YMW=)r_tD{Q`PM3?8us?bP8`W`+rGS+C0nmq<|t**5rMD13Jm|
zdPnb86kV|KJI=Go@gzML`-1U*{!WjJj45v0!f)gZ_>{UJsCxV4OgFi&iI3vI4e3WZ
z*9zh57Ct(8_%Z|vr_#n<HJs1VFvui%mYyrj;`n6W-n7DmH%3pH3#ox;mO;nY2#P4r
z_R__pP*6jb8I`g(6-6R=f(DmIQwY>@4N=|1&AZzA`{<7!KZ-g^jtVI>m{LYYMlBNl
zz|*Hs!{4G=6uT{;{Ds}BhM{;v5y!lsOTTN}{{csB99PH#JumXs{NH9PSoQ0^0}VIP
z1k{q|Gs$fmw*{xcv}xXme2wMgdafV8Z&68Pw@$;4Sp6OTP_ohydR{{5OiN3<pVS*m
zK+U6*Zt4A(P?WV;y}_Xu0Km;pr%AZkavo;&wN)sU1i(c3xZQg?0=@3|FG^_E*cjGH
zsI2UFu(sM9GvkrR>w-`q&9PldsZGjy^m6-5$q)Ddd+F-P>qv`ZXY0?u*R$~u3qWy&
z69XaED{e;Y&yQIxM8_ZnCReW~lvh}d3F8+ZtzkJe0*C_U3gT=0{phd%NTYBn%3Qzq
z1lx8Eh8!)Cl@Tz8&{lqYx|WWPj(B9!8%H=L!Lv>Iwl8N-WClI;XhA#)8}ddEB!H?x
zlG2eQVqWI-S}h+!{O8G_UlK_LV<pzf*CwTjn`aZY5x5~lce&aY>gnUO9J7+h!k&|}
z{NZSm5(5+sHm|G*H}TLTD{#eMS<m9?gFld*Iri^=A=I*Ri+7xUx2Jnj9q=(Glzjv(
z7NCk|34J^8Lz<|bS&efEXW4)yR9POr*U2ZiC00d6wITyQGUbjT)nzLK1DlOCHc3+F
zgV7f)%;FE?2i%8md~to-Kj=f!`r1p?0RS}>=y(CjF_^an#_RTh7phUjO09(TwGIDx
zgcg!YojM^~hBdG1Xsu*Lz_hWCegs5#bGPN;6T26VfW*dA9?KJ!GEYRwTpt&}u<@e-
zmG(${WYuImF<YajvbHIW!Yl&uoayftVfQV%*;Otv9nO~L>xd8rx<JtG-7Q44L?`Iq
zqNN?+5(Ur`XwG?2s?qwsl{`%^JBzeKczbdo{7W$W?+%*9fT82sr!W#A(W0O9T|i}&
zF28%NsiSl7%H)w^hAJiF$m5fjvk5XDKKuj8bUKNhz>bJtM1lbqbr2q5v#>J-!n8y0
z-T|0^|8OJu%m{K`GoL~y%%_B_B9Yt-?sC`=&817FT?mXvfm>h5VDJx4v|f&^P`I)5
z+Z>j4GmJLrN(!@<OCOy9cSq9Hwjt7;U;O3W1qh(XSPz@;?mW7%Z;QVizTl_E)|*vU
z=nNmK38h#-WMrf$4X8Db5ZqzG4=3AQDLo(1i*G-ENJxR05pOng=FE4GPY<Pe!sdog
zVUVR`#05R9V+clmzUv2gssC?;BNoZbx5y<G&H<SvT!^d5<DJ(GxY4@|^7H7(*$C>%
zdzBRscw7g?u2$10ix5jHzLUZqKtZ?<VgpmHX5|v~ymUv6E1WAJ6JncdUQu7;yV(cC
z>)a~;N#fV}>y}r|{PvKzY+Nw6p2pIF&{MjH6kAGtURLIl<h22x^gItUR*pit&7(T0
zEEQxM>zMHmCA~plY%ac)WRvkTVue91@5CZ26#R{=_x_e`N)8Wjl>PRz^qPtEFA0<?
z3UQRhvwqvY=jpHSY!!YdCrgN(6s4z%7T_pP86CH1G-jy|i6eUSmxw7j7o4v;NFMzW
zKXcd91FE-l>C!{)XPVICm&@D7r;AL#pjiTZ;QHcuMpLII(S<_^o*l|3?u$9ghk9*p
zxarDRPfum?qWXq8z5j!pSFIQi@)?LSCGlK0*pkL`^gjiz2KST$KlXw$P5-_S4T=1h
zD7Rj`e_vlADR0^5eP7LlA0&83wf57Uyb3P4eoT%YLZl`=SKQl<21$CWQqxk|{i+fr
zK2l?-sHi9u0!d0Kx+9|SZP?Qe!8UwR`_HW=AseCZaFMSLj^YHrBv-dn+%^EW|C?xh
zQ$qVr&W_-TZRe@vjN|^;Z9F~#?Joz*nk=eIxWgGL*QT_ZXzy#YlVtkCgwc&uq(Y6M
zVF^KaHYX64^LU+}ewDq9Tw+^V8@;~+h$?v#`w12~BGBviF1mM$3<-A}4R2S7y$FM9
zdw6(|$hU$C1Rv0vMxo>~5Y#Ba2fV90@-Rd{QKN{_kzZxG-HF9x_;Z4-!dX>Pa9~@`
z84`UM!{^nMHqJK|{n@#51}MA;=}3OBJR4WXpK=pzV2^7%jeGX2rVW!Yl8apgi~;_%
zQGLQekT{A1*un4XQcKLf&=wFiHPoyN=-w4p8#CL{`@3mwaN_fSH^UT2<IeMZkI;I|
z^$Piue1I+%RmPPP!*9EYm70wk!wuZ=+HfC^_3Skz!nekqutZb3J>VIbWE&XuDPLA!
zQ+u-PEUcfwx8kOIfs(;62`|$z1%(!U%#0b*Pt4$>x>zW%_g*O=Fn68)0R!*w$PvcU
zUA*r$ux|={kty-%U;PGc{Zo}aDFX)NewMBi_nm%>L5{cZszFzod=Nb{a3<9Z`uJJ(
zDyu|*!M$>kIM3cWLWdtTXdH)o7U>?iDEXi>F<ojb%3JZgC%(~!Qox{}G^wEZ$sLi=
zl4uf@M2Cbq*0FUSDL(ztf)<bfo?2F64v4gzd{o+-p)D)}aW8IL&uJGg%H?<=JzYQV
zmOft{YJ2Wg_rQgV`|3l@&Ks?>F^e*r)M{J(m<i(8K}S)sb?8J^@ZcXk*&g$|?H!%w
zk^Z6b9AOB7<6e?Esek2OA%~-)n03J9do0<~#0Qyg`*N#7kFZpX|LY>~P@Lgj4ee(9
z%>~lLBrFIdGZLadDCzK3-*;2MO7KE7Z9tM)JHR^Xj}A<=m{;<&BYF`M!v;kBOq3-M
z^_3`0Mco0H!BeBi?@RHM5Ntw3d`2UpFE}(}V1TJw?D;4vDapsV8U+QTA@>p*GR~v^
zMV&7rr4bjEu*Msce;_~*k}D;~UJkwBO(;cY!S0)0z&W+<5aOhW%-0aH)PS?MOK)_0
z_MFRb^3&8Fw0umt0Xdjfgdw>;>9$^6Z$GjqqvH;){!sE2FOa?w#S`(!l&%0bh-Pd3
zH`sTIKC0l9_jfW`mjPZN%0g@%l-1hE6w&?)jt-b#hXlIqgo$+AoOx=?`KGa#IQ{-u
zg{|J=IGWxhJ&NJUx=r6ydEY;xM3y7alyuE~-rlPZ2x#x<*zer33PG1pcFq9LKr?-4
zYMV<zI{g+>{8L>CGk1Yf|APTc3gA1-ArMz&dLzPZfPg6ArSP4j`qoME%CD#Hta_hJ
zHHk)^V(Z(Vq%#Mm`hOimO9c#IZkoD@Hu-^T#RKXi5!$!#`?(y7-Y+?@{{GhPNu1?@
z+yCnLD10ilqQrsD6R)4=bK5%DJxnA^5d}Ih{Za4jXQ?9Oa#V;Ky||fOxZUoaug>u8
zJ=z2%LY2Gx*&-CvYrwrOJM7QMcNUvlR{U7nf&JIFKLQ@`CoZtM)jkkn1d8C>6rkeD
zt=_#TWvI>lS?ft1rb<L+jrH5Kc0U55BPk$F_8W6jxx9o#g>w3awbtre_wF^}BdRK>
zyPFC`x^Lfvh`Bi6IAON9vpgU-oLe)B<lYmLu1%cXx7*7ABWa6F?XK}glP1|Os&oAN
z?aQy9`9QMkcH{aX2+Nb^HhP`SgFd#T;%w-pppHW<-zBvAY#q*Z2g_i?tD7rChc05^
zxZ|P_|M=lWU-6rXbv@Vwvg)zV6SwwSc+~#Q#Yvu?D+tXBD?q2&B~L9xxTBD^z_jg+
z+S8pM6mcW0cer%Py2MO_oWdUqX`V$M7&Ww_?+BzVf=cddd&6)0<^%u2(IS3!vf(9U
zKam0}#^2OJSA2<+3v}Rqd|IC4->YKO!L3#TIP9hSFe2HsH<ghLEdkRF-m?DqPH5Ou
zy;h3|fh-utZy#pa95voOZ~pwICI(^a>osiXR#6W8SFPB?85>&_n69GR?@kL$itK9C
z#$|wTx+OhFkL_~U3#KMDdnQj&MtX%cUh|d~Dp0+OVl<+&N2fq`JK4A<jMQfN@Uf{c
zzVJLj9DuAjw9>La`rDv*M3<|08H+3(2fvEXkyAC5oE_saj%PppegBg1@y)x2tvHk4
zUv4$gx5I7zO##}V3gu1!ec(bHZdbfsJI-<VSoZ!=Qy8-GT_3>mzd-IPq(Skzl&y4f
z#J<66)~s2<^#ymU9*2NFQA-SZkWxw!FTfJdWR32oVEc_EDYY6l3~e51Uqwj8<mPzH
z;+8P@UkZbW96JN3o`U)pO;<8IY0rKCo*Fu+hjfL-5SdRe9S;=4b=B2ZT1Q!tR&pWd
zG@t(=Vk2JL>QWxTOA;ppL9=;|QWOcQOXPjsy>(bj2egr&3&bcG2%jRfBb(&V)K}ur
zA-p;fF|u5p#@xSk9ynE8d%*H+!CsUU^o@fk(rPzo9!$f9R*_@Qd}WJXy`&+CwGwt_
zx~M2cYAp0%7@5tn0X|ppW&3@z{&v-^Y<xS(B<XuRfA>xdN0Pk48VHIr*s>z#VNMTI
zQ>WaP^zn;Qw+}2+6|9{z*u2D`3LjU=2cgqMmyaD;NN<Am>q0Y07$)pnf|45&NlA{R
zA<9m3J^YEk+7Zf+o;bhf`pbzpUnN&=9WoDswirGNfUPSXDJFQ;CAHGn6!zB~AD`<-
z7rj|pZg|p*8Z&3QRo9OZfi}JTDw`w5QBp7yMzh<D$5yn{Aq#R4mxH*4=s1l>A{R(^
zQz2|sIammZl@u%ylTS8m0dk}QKKj9C_j?eKLJ%8zQz8%M=pS6<EicO63@@u+E81iN
z{1L9+{Ncgo5Ya>Gwh&|?a#Ml4u!xGN)r;;Wh`*N56b%ei=#72Dz|&{qUn}+_GK`Pt
zJj9T1^X>chm#|vnJx#i`ih4{^deSO6TLjGXo?Lpm(Uz`_f+P?$*#Ek2TFhB>2b_4B
zLZ?j+o9n*P3Mq*gE44hP)b*spfm@7Ms+YUA2`L#kOsg6SLVixKtGOh741rt0Quigz
zeR9UF>#(#%zB{K{^^aLsyP@8pk;-@IGv_Af5qxbIEa*P*=8pE{MoUh;Yi-i|-H5hb
zxiRF;NG4K@J1-&rSNDG~u%Jn=Z>W_AgKds3?gSWv*@E;73BQ?+&NS1lUuyUYs){}M
zHi)Uj<<<6&wC|3|q-0(7e<I2#q}O)EuY2+Bs9LajXR#5O4A<eB`Wrqy?kS}P>>jyz
z`A1<b$zw;UNTAlG{V#ka#B8f|HyytVMO~0gO6&2Davs3Yhi1WbW7LcQsRAkk#cLd~
z7;$${4Fg=(C*#qhQ8Yn$pf@Ky@y^>kp@ql4dgDMZ)xB}SlR{ui^10OMy>T)EMt5@4
zUd+%nm__xxyu4)224cl5(kp`pyYEpOV+z4dzY<C&SW%m*9&3Ahpbtb85CX`M0YicI
zD7`3H0shhi9JA{PJ!t}n@ECYPzxC{S#@HmBQenuA^&8`9c%^eOlJeMb0*<=ul6h}w
zCxUzP;Cis)8Dyu<07%9?cZ*gNER96KdlK->w$tA|I8>X;j8&q4<{TN<SIlz$UD13V
z^n=72j@zi`+vhgD%eNQqzy`RquZDx~{;e~6@D@`g=$PW7B)ZkykI*Hb6KJzUM~Hcd
z%!(2oqV$11N6wa1ziFq@R%|Nr%XMV4Y)!EpbKSA%nPWiVInIip2+;5FHCF}rgC>W1
zLE;DjF7e@xfseG`xeXiA9)d6}-G~Y1g^O7iQLwb%`bC_Xs1GoEb~#XVrvRsl3{t1(
zusXf_4FClH2r)Mdq%hXwTyMfWk-Z@Zn$qRz=H=We@|b`b^xGO3bqBiTR-dbT>+)qQ
zxBy3{<#nQ1m}0s$Ut5+TgG)}-vEp7tG#{dC$!&|=Ut0YdcUI3N)8iu#hYPKc!G@qy
zvIyc_z_aq?zv}PPa^p5+b!sWLu6ukK${}@}R45>H0rWi<e%gPzeg9FaDk>5&_=>bd
z{X;`Tr(VYnh+BMot2M{(_>mhkFRqQo#>tVut?*6{Cfbq;f;!KJOiF-_oSD;ZFaj+m
zqr7OIMY#rTe2+K{z^DtQjVuCZs+_=<b>V6D0GF@Rr;!-3C$-k>uhuPMe-(*|1_%5$
z>?~aN@wib2M-lUFS}@XTB;lxaIW??WYu0$$J%Lra4XGT^De@K7hQeQb2e^T@t=;bn
zMRpdb3FzM|&>JB}(yW(Ga<J~Y<M;p5v|`30Du5c(iXTg`Aytx|EVM2&rjaVP&g{a4
z9!Ll3B^+~cpr%Z+b?pD}CgdC9Pmi0Udk}`7jlONC-EpLr09r)w*+}FCcjc~@GM{Tb
z1H;&|+%y0!pvp_4?xsnSdvj-tjdMaNhYuKED{g-fc_4=7%~+wEIctAXIKt`eN90pd
zaMD}s3>&@yeQI5H^m$-KN%K*AHb#kkL6krwHH8EHT4>q{f|3>j1Csd$iPhw&HFdYn
zj2G^%f=Qv~Vr>U}9|b5%vojqwUAnc_(j177gqDV4C<K-UPnmF>Nt_pFndwhbT;B!9
zycV}}p_n%_vwqjk#-NYn@|<>Y!~nmUWmw|Jgh>yO25*WBkvn5UC(aAGYm+$H`dt3B
z2<j++Q`%nJsNcRGUk{?|ZR#=JcITP=>`6(qM-dj_7#QaLjt7T3+={w!cQf#H^(W$F
zfc;zLcUlPCDm9s64^1e(gl1>GH~yr-7&J)-KsFcCCS^_|8HP+2p!88ad#mo!vmjTm
z%SX5G^r@x(4fAC`yQ7A3?N$)nI|%w(wPcOTs_N&bhO##a)8d~;O*-szjBe+NYcKcD
zYz$S4qci%{EF}t_l~NGsmifeOu-cXf_jDjI0d85abyXE~E|mrIpcFO5cr&b;-Dsl_
zizf0}8I$Js^J5)Ri;79i)-(Bt!R$$_-wUA`r=vq2=k&O`Wo{sf68H=e6Be0g2SLF<
zHA-7!N`2%NU@Q!+_SqtSRk^AY3SPb1yRCi%dL^NJ%)7lE#21`O>;%JxZcJvHr!KD~
zT0PM-DE#T^qys$=aE1(uxveT&AM?KdxcJmTQHHE@$8IU_#3`E=v?rU>=53%5`K0=m
zvi_s<``A2;A*+^_0lo?*^R%Ls-&5s?-_`7bo4SY|bQ<dQf{F6?jrQMTdN2Jh_D;g*
zsmaS24jDi4$=(jkLRJ>JD^0VPq}(WzM@6`o@?nK7DF~0SBmc{PlKBgUeHw|C7!Af?
zUE96l2*cNorj<kNx}01K2^oph$?h-gJ1kth*l*a!zacPsn63LV3YjJ{w%H)U0x(0l
z5bh$yp#s&x)1Ui87kL~q9ugCL^O<R*@W<ODg4=lriIpQL?C{g47bfeDyZ<UE*uIxw
zk(4TMXLoqNgB{g}6u3=V>*fkXLCbsq^{#ldzB-j#k@+$x`})90medT;k<{!t5&|-o
zfY8vv5w3NH+$D=Kwull9>O6y-Rv=XPaRwI6XO7%d8k09p+oo*YR@gxp3fwcq!LS;r
zh~5*a<D6~Auj`%kHPp`;)8fq6ZP4r49`*NUJHm8s(o^}(7F>S%d@dj>k$O!GI-t?G
zWHClmaw+hW)f&#zq~k^_RQh`APB9V02ub6_o{PBZ3}OLjB1Q{>p85LbZoSuXhlcH{
z>#yevXh8ZhZsNrKDJwT^*-{ri1HkZKkIH<o%FUSp+F9KalAbIX?XBbvuIRhSm_({i
zA?Chzi;e0E$aR7U&74^>9Z*3x10-8$bs8I|gp%TqqtFLjPrmPLpM}-8n9kyAC)15T
zlCw$sgpK2AcdhYF_LAVmA}0~+OPLG&jgVrx?B@dUoS{b3Fl$9OerY0CMQd6dD_UAL
zTGc#Kl|Gx)>s;yj26ubQ{0y!v84xH(Stzd9&on^!%`-|0s{dNJ^fi7sZQ8VAd@D5F
zTR}RH8ZBK7YAMs2I0zUB;|+5+ZtFZbUBYNHTlVvtv28nr@T`Pf!xuOv-sLe~I8uc0
zu&Gs}g?I%wLq@&5ZVtf3K}U&W9TWZ#{9l=X)DM=Ez;W#OmpoL3#3Z@SsVW-xXfx#?
zS>{+W5D@+Dtld8IWyOzsNbg46Jy9w>zw}pG8+rio+f=pDy>fOW9&y@NkRw8*&0N1_
zl6+w9yDRsGf43kKV(2!xJFBL#G!9<i9-GygB5y?pCqqpMYHM@{OuMnmLWiFFeZsr6
zI?F74fI8Vbwgc@R&Y;04E(+v-?33CgOVYu^NI?z}j~a4Z3TuHaj~*0}mwdnn<ddtx
z`lR(NcL{DLM)M3#;Q6%b`c5AxDroyEjBhw-tEo!y%KZoO08VsPx`bT9K`cy&uI<5{
zmVSDeMl7cdPik^>?SuRGX_AhN86Tq#{;q*M7*e`WxPCcah}G|#H|^*3N;tpHyKulj
za!&a7-<YNJ0p+VSO5%0PaMsv5VUxl;OD-l6X@EZpKad1N1qWt1-(0AoL>E>y`HhA~
zcGIylBL>5(zvest0#7&!KtVdD_J9+ORDiibRvP@<tkHK-ZM$VPYH0hQ&p-^+*iap=
zy<wH)+^`GusYdAargK@@I&SUuXHu<3Q)&^!aE+<&8t-y}tC^Zj?cK9;Er16_(Os>h
zllinBV=3(+$iRteXHZ*yhl@`{vAW28f%Jv#5EK*?{Ko!KCKgLX)n>gGm~v#3$W3{2
zN(wo?T3QBktlyWC&Hl1R^t5ZGJ09Q`3>&conY$RfBI-JPKdE$}Q~tb1)q^Zz{Us}c
z-a~GG`Kj^gQ?Ol;)2fNP%i_ge5cNf>BlU@35RfF>xu+ZlAQ-Stw`UW%oyFNuyaIU^
zkQM?UZ3s3@HT|3<jXvfRkR>r=B6h`13>A4fHa_q;BkzB-4*~m(-?kb15|Ks#4h2YE
z^!TyQ)P;H=zx+5a&w6USJD&mIa2!=?*Y-sADqbm&DOJhgVPb8uZTCI)KOFZKt<@$q
zhj^n!Ss)!2QCG5~ykPDM>lZ)+Uh!|##)lY9Px96Tqm?*ukRXc$6b|G9LYW~kHpwe4
zAfVmGL-b|~%12|XK!$U8OQ7s)!!HxBY{$J|92vbJj^P6HwMc@GM436cd)SwHzx`$d
zvU7wGZ=8h}MSz5GQ&U(kS6vq@_%_L6z2tuM3+c7{Wx5oN3+0W-stU@_VON>-N>d}d
z>csq#an2FtI_FE2f>v2IQq@|1{g{58lj9q&R?&2bw0c-qtC6a$yVIQnf1gbcZPscv
zwLR2J5jZj|c;Ub<X%{>18a{P^Rm93Y=VwK%Z1CwvhVL@R-|Fh+tEO7|b-nfJ`j4ld
z=6xtyY;o&r8~x4Lsb_L+oDK;vzfOe={mY?1kUz_cH-n8F?|%OK^=m2C-t1s?N=r*!
zbb?bh-_(kV8PcA@z{$m>Oslbld2w*`q+wKtw>4NIdt3gCUWy^JK~(#vBqMwh&|9jT
zRKImZ1!45iQ;2%y+%%b9IcT0R;^=iPw>J>WB_S4wD+%ePwVl?E#KcbGEJ67@Y%a@G
zvazRI?2n?W@6V5@9z+(cSCZ%9=iPPK(4i|gZqx(@{JPaIp_f8L#w10ORe(>4u{E<j
zUO_$O6I%KDcBP@YYJGqAyCQ_c=+orvH)WCwSP#M2zCkgf;Hb?}h-$4igLaXy*ucu_
zbElFq)H!kDgz(JpEJL^1-p59N^Yc-kf-4I!FYE(@S?D_P@kZQZFZM-_v4+*k7i{e#
zljRnv&6zt_*ioYJ+Uzz)!+iZ7j0Z(5M!I$`cQjUpa*T+EY8%$3Y=?#C;^wv&&CzYL
zZga=T?8lGS(ye|8aealofi<|<tgprHXq(9kT0=YG$2u!U&&p^sLvKb`=3h+=>=_-b
zkX%`8UY0Lcg4Vf4gpkqE%9a&njm4yz{y-}2Y3nS8|3<f+P6E26^P~oJ7tjdkMZKyU
zfA*{%o>&eAd{k+oX}66oj;C~a^2K}1;7x&nwE?w7ECOptJvJ_GInQj&qBm`XIa;{D
z#;Ce*fW?N;T$H|rxo@kgPJDa5vZ3?^mVC@?N>Ui)FT;Tgr;w%~b6S4eOob7VL0tIb
zIKBz_TpFg7f=WX)J{u2-)B61RGrTF2U4E4d6<^mkcxKuA@vzQwqj#-h&cWj!p;hN9
zKbr(liN_v1n1TB)2!&QVM5!7U^;}(D6NHtslGPFVD%Gpo(%4S<hy5x#^sfBoWpAKK
z)qiDva0?nS@beh?t8(<%vDa_Uyqw)II+2E}4h2EG_U)Y?+K)8Q<RZMK!c+{!<f&7M
zrPgmoFI?E;Rx$%ieCHlcEmgD8cRnbyRs{o~C{m6o8d=;;A7yqoz>p_Xm!Tw9wDjA)
z>uBu{Hg}WZ09cJ0r6X!L(2t9rNw>pdbxuFaI@R$U6Ay5ZQBpV!9oiSvoBG5xJaMk%
z>|h=KEG}&wtEg`2Bw0kOKS+MnsxM-_JaC#(+R?if7x$pSjC)voZermiXZZ4*)<ws6
z>}bwCd7gb%*zRt9!5+w>mc@MPgOHMb?M0LwN{=78H*MbhP*Je%CCk~X6(~ma+qSKP
znd$9&_X0uA3^eN%6&2AdTn+@}R2a<FR@Myd&8qdcUq7kZ{fcwgs&nVYxw*Ne{jT;(
zu`l}uptbh<_eD`&PTiUy5C91n`C@_s2p}EEbL`BSCS#A@)VLC^>)2*d?Gwk3KWBqn
zzJA?h!OBcB^ga9c8`c~q<>BL}jG8;Q^Haa7rNZTg=gt0~JYz<ny@3#jrG?~nt+7QQ
zjindIZuOQvy7UjDUO9CeI${Lo>Xv}9`lLIaNzKFxi(>Z$Fe&NKyzNypC$w5qR}AZj
z;Pq20zC23E7nKCC<|{e5Eo(NE@zGw7;cv*X0Ot_45UbM9-Cb9p9}pV+YGy}v&-Nl)
zw;eI!cMRjkPM%y>S64S+VVM0Y>gDA*&hwpy4XaJ}1<74=kog5AX<wzPV)%B{USGA)
zd<`Xdn#-Y}=k!=@vqJnBfP-9S8%aPJBFzih5<*Iu5;g%!<gF!P6W|7k?;X<7*Io}F
zK1A|s)3<M}vz6c9POAJ}!=eG9ZJ)`YmPPGIY$a0>co$fL{wn5khCYWJz4-25li!$(
zJZ}GH2R<-N&vvaIV6@Ai$q2o~l$1Y!V4cmoVWLccI@n#q`PNaocu}pLx_!FkS2^^}
zH`<r8ahI^)Hzc4dlBS`q$FsmVG>W}fqC$5ycXM~IALkPPlFedMW{`Bbbe3ScG*8A(
znv^%SQ^$@8Hwx@*YzB+H@uMAga&m@jX}a2mgxI$&qt3)dj^4zfBksctH%T{$f2X{k
zHHgm2)W4P?EO#zp>)Er-$Rx!!$i>A)*l|+!QrNa7W=BtQSR4>ryq2-Lvr0Bc;>9M}
zl~g~hi6Fb6v6pY%vctg#Trgn0OiK$bB)1V<kJL7AV%P)_7V9jn(X5<^zr^>AEvO$>
zR~%0`r2tp~>-iEE3g!}3L1A8v8m&2Lr|p0N>UZzn#dUR^1aoc)O>;eoPlioKm=|;Z
z#>zXcZ-D>Fs`Y(sY)*aEPY{5#LkD#RtV`)A$qhnXsz%@KHyX1n6@9$CTE`zen0;Vy
z3xbiFE<@sicbPUB5gZ(Q=E}{Rf#=Ry`uO-r1xAhe0>Czi$0|vj5Ecy$`xS1{O=rH7
zpXhf*%uv`&o=o`UDN~XZ)J9JnASJO749%>=#)TDexd6)atmo&Il<xSh-*#Jx_v0Ot
zNy*60hQE2dxFk&)hzI{-UZiu|@e?LoKo0MjH2j6?D1~S|)~r!mzR_2&W@OXdS5zEo
zuxK3$cw)0l|L(8dmhD~CebAsmcXM;w?>qx~NL$%Or_k)zcq5BlHu`<o=$miC8;(j%
z9i1tIUCnuo2GW$CFk#=4fd-mU;Kb-H7Q8=^r!p|F2P-AnW#}YS8=!zjJI^4-q?_y_
zn>+TEM3TFCc^PPnDeDL$)`-P~h#4Or|5xbl`L_T3em6*J&@#M8OVwUsMqtIgPxh$l
z(2qr%g(Oz0PP}@qUcD+Z7}5dg@;F+mQbIR2_Y7J+{?Dxg3meJqw;4Q?7)C8M3;O{c
z9oA@#0QIeTo)D2Vbad3~*RLPtMfLJ|LCxD|cZO40OKvAuS8dV8AxW>-*s!i{2;ZSE
zn?N-8;o;%uk(Q>Ur0|+6h8qWN8~L?+*`=D{Oxd-7y8&P{AgI%>h<e2{!tGN=E`sz@
zR@Ac5*Em~Mxfrsr7vPC>J7>*0a@(wD%;NBC@}PrV*hZ?H5w)2z!B2At3za<c*?O*k
zMsz9CS1w+|_MZ)dknQ1U5><<L;8yI~OQxT=U!h!*G>GWf$7r+B^{rdCMjbigv}gRz
zEZu)iB6A<k*MmJ3@8f8+`6a($<XBe7mG|qXC|#}*>im6#_njG>OxGQS*&ip%L6RTm
zruk4fMM=boTkvXKH-&=3BM|^}8+ly&S?TLvk+;c@Udzo6jA@bi!hB<OIXrb0M=<{o
zZ`-J-K2NW4@ir!p5MqXkW8Un?8k~vhpS!kK#{WqSWCqUfVy(b|cH(YvcXt;T5OZ^L
zYrA(^#lMnO6Y|(6PX+-|FK62D&rjlo0?x=Gi__<GJ8wrOJdUV<vK-Ub{)OE`*VT;u
zj?Gf1u$VC;G3PWHHJH%oaY!@*ZQF51G`n=^A`lXxHfnD#jeoW%wC=jlAGDe@F<RT!
zz~K3&!M&%r%I7F@5HC0=q)~Bk{R`WLuCuFNgxD<lchXlDku%mk=PzHb3wSPlU$BJ_
zH+z=U)eU2V4cfizyLhyy#JQ*eN{9keKg@pI4csbJ9R4|B*`0Z=B1|P`&*IYF03u4A
zcOf^|wQw@)_MRs0^!Z~9G+S9%xZ6(AL+UE4S$3V<`x4#CS(QH*N@^&U1?Ez?ccvnf
z2VWHS@ZlRwVe788k)2z<g)@pJL3}h%+}DBYb^BZbFv||JJ3M{2x4Y{<d(WbNetv$!
z_epF-Upv|crIvT}9uF>i&MTxdg=9TtP+zD48+PbksM&*M1E)@&ER%*f!IxHO^J`ON
zPBuM|-ze5e6QfYOpc6k&TU-JngH)AmZEX<+X>*O-+ouy*jVclUpF#-8zJ1#tI{?Lb
zI=W&APgFz3q5_wAIj87+dxBTEIz$|}K#iI;yFaz~;Ni3BbX33?{pk->Po9RqrDFfU
zY^rq~-iyosp)?do>#kT*N_)cb7fu!-5nsTLW6ik@zf71o(G&HELjk*UC78ED0kqUA
z%--eqggZ4yDO&IF@#9)1sUMUT&-s{_Z{EBFT!b%*5Qp_q)IDFtgYmwTmzM<!6n;~e
z!Jgpj3V#wfiXpqr+2;S+ql=4A7dW}O)lo>>pUca1#^de^e=f4A)20Qi<tDtF$csY%
z)NW^FM9Zoc7J+;ink4JtFUtBUW`UGr3ZS+l8B%lgF%F$Xw_R3%hl|~0XM$o>Vj>85
zyLL|9ljTKo2_yJGDrC3f+Y9EGP4h~ug6Uk%^6^noz$hmQB)S)kJ2t0!7ky-kjA&C|
zzVsxa^FkL^x_1%%kw7XW5_uIzuKTiC9Xod1U~eFr6oTj5Qx|K3r$uL@4jr;j&h!XK
z61!N~@}-r3$mT8kkHLFJ*%e=A$>*0J=aavFRsJDPrDm`58#XbTHD6NxeVD_qU*AdJ
zOc`(v#~?M`*Kt>`T)BWr<_E59bzO!%#%4*i$^Q=!$41H5st!d(DM`R--b3?n+Lu4b
zz4s&}RM_X8#gbe~C;FB=s)GKu%O;XW{8nCGj(gCVCDQ?$*uVbx@r^x=+Af$4A`}Mx
zofYW|SJKD?v91Tz9kp)S<jLe81IA596u~ogOWDO|Fh3kwbMfmPN8kq5^qer=&cq~(
z$1F__$t`&XoJE%dhZoWdl#U8DpjWFRv;GE}qM(2mtM&>adTW4Q!^y3T3=LV(+RYfG
zh0wnI_Gic+{s5XkkENt6I@pzNtq?#NSjD0IV)Kd6U=dQik)u0ha^2wwTzZQNQZ^vF
zIY?E#$yn}a=|cF|JF|<@Avn`BHqN3VQ60)S-vf{LjG^b;Cha?LfO*j>nyXb9@4KLI
z5v4wW_Q^AsXtQ&N-lt?kVNn(nE$?{je(N=S`;=5}-LW7+?K*YB;$Yj7pO#hS4Y@rL
zIIHjg2Qp9up?K_}L*a*EQD?HWN(xXwAzTTYC^}kSVFg=KXpL021l=b1($J!q6EtV@
z=FP1TRg1qn3U&%_i5cZ5O2mWGl;op)8&?!LWV=JNeSB=t>mFPTn*_%px5{#OH}V#O
z<Iu-^nd{~e056pqO;ODNrpA&q5U!U@wV>Km%cDh=N%20;%c~Zj8_EFFtJC_PE5|Jj
zp7TX8^TQiHB4O`0dv?;#a-!?if(}MT3>9ub;|wy@^MsR61p?Jh9OdG&3KYi^6kNbg
zbaq0CV5Culuao~YGkbG8r4EH)RGOnUH5U|d-DSNw2X}i{-;mKWE-<)j+Py<UT&>of
zclRyQQd1Z66(!Pgkw0_fI<i6^z=Ev}G{=#Glci_<YLW^KO!0TmaiR)=Sd)R5w9q*L
zhl%=16ym>ko_KtFd;05$Z5a)40q?Dr1Ryv<phpXx|6XRP&o=@Al~x92XB`IG9)aE&
zocYpjkse#QQAj3qNh>n8i=JkjErN&&LkPqN&abNSGV4f=zZUMSvO+Z5s3C~Y<4B=y
z=j5aV1AI7nuA)Bl@h!a;&e{Go757ly0g7CLX>U4|ckkVcK7QQgX~ElNQSoH1@OlYz
z=fUP-?5}`wB*Rv^cI{d^`s=>^`=h~S#2Jg!L>M`oa;QB`;RyQ0Ov-Hn!CQKN5UQ9k
zW<7awY>s4e!v6C0^$mS|U@beicwDRMjU0ev-OA3c4(@@aNtC=^fRt))N%tj5BM!ZM
z9qDn2W|){SOXG3@B<4}?w>=<=q&r(uy29X>84B;4L=&Z@>A&Q|g*q(MhR`~ys#mH;
z>W2yRBL)#}9!uL;Ru_&-O0Tq~fsp}nRAGN>Z(LkG;K$N;;5${{vMec+gW<6Tj2Ygr
z_*?nAW4EL^Ne9G=<n(-6K_RTd!otFN>dr8ICqq^H{99Yc9l-#x916z24H`ECB8fZ?
zw(a_pPZz15Pdht+^%Th?@0j59xIHV&CSh2p%JZkrH|9TGSjS`O=groge`HA&g)!WD
zL^}Myq)=u^Vc6!)0c6#~G7U)+xZPTsm^36Z4K~17Jp0(1>MSbu5zpCr>C&{HpH`|8
z-V6bhDAJS^fJi-kZqzyKJZh9WsSAk{iCHwL+LMyinVvg(&Tqi6aL5x?lF&|f@7`@)
zwO>9G<$HHdYtWxYuZoKkH&h77=<KXP%oy2qJSDNhpW;H8Vl&T{8*;|k&iqeOP{aX$
z7ZkiA40=C379X#sFwgx^0pV3f+<8=%HZ9*mRdD#Q3B!Qb(Gu)%2qI?QYx-gw`Bj@Y
z=XzNjwA_ZP3iKQ?H>ZdHd~ay)bkL3syLTTh8=hbB?8o!4!IFq{smX_*T%w(q&3NeY
z0zQ<)dYq^1Y;6P5AAVV5-|*`7>tenu`6W#1VR^T)0RL`5b@Q{L#Lo`sQ%HVvW@X+1
zrYd|+W?Z8SJ)frQ<o7cX@)cs>p5e}0EsGse&JCT8GQ@k?fjxT^RNZX845WbJiE**9
zPVH3M-x)J_pQTZ!PM6e8Qg6qHeW&<W`dQUEr6i!~n908Dfc9lrhVI#dy@Jf}-bhaZ
zxV4O~`{sWo#ykJ0xm9LFksSf};w`W#6fcrypj6hh+z@`&Cq0mDU}}1#Z|2GsE0mEz
ziN}v<IC<?>O+sc<(a9CaAAG;;8FR;E_75y*v5<l2`GQ`SxWtp2eY$_{_fzx#c64`7
z%l)=Dwa16blC3yxj#>KIb>B3-C1?F;X4X+8{E_fgi%AepDQDhSmZUxNkTa}@W0Qo?
zj$#F&06wXsh)qnq#3`n_TS3ZMp2B7s_AJe>%ll9<KFQCYD2`aD`Aa{N^33ZV3A0Bg
z=>hh&wzO=)(#tF&jMHA{UB7m1CF$wUo;PmXkem(dX30P{!<6L(U?6JGysuSs1)MyS
z?z=fbBP7#1amp>edNyqGVKY@#9xl5C<tbh4%p)S21h_jxfmiJL@L1{jxvGkp2dA+;
z#S(Mp%}2lU2s1M?9n^!#psBSoLV)C5Iuo@m)c?6`33aU@?HU>ymImfsomWrEiVWqT
z83=(QA@uE&bC*`0IY6R`3Z(S*Kz0>%?~rD+z%sbLLiy4PnH5gdCRS<{e}t>LH#xZp
z`O>x}*8$hXw%D(!0*YB6e;{b0QgPaY8-$amU4HiYTJnTVE%-Gt1%djogVXE{HqI;I
z5myhy`thu4M%M$ZQ(5;I8q^g1GXA4nMB-uL;VL|Sj0DDvtKeeP`tAamLz{O)47>Eq
z1M|KQN(MSQ27(l)3>A18n0VT!Eruk7zRl-L2J9R;hZa1I7%Hmh)YRr^xR8a_#t%yr
z)xEwxo~uxJdwcT^dLU3;MN$PkfI_6*lLRkDk19l%42GecG+eaHl9X`uazC%6+h6qd
zZS`<5Dw%d2I!x;}Y1y*g@Nqs~(wQj}9!QX~_|Q&c#;gZfR#uR&({PljpwhHwH;V*@
z&`6fk$|PRt>mOTM)C{p%08!rA$;qD%Y4T#1Vi}Yn*4)zkp>`s{5RVFW!)20N0+f;h
z*@78msRSCh6zM#W*RNkovF?CGHX`C)@lsefV)iO%P^$Y)SlAL<0Hwq-1SkS6cj}Mg
zGfYNkN(L6TbLTq9=Pv~FBMAoT7?l=v69b;7PoFX+r62b<)VL_zUictEy;pM6NTh+q
zqwpsd%08nU$~S!G`kG1z9mrUM4HKEp)A>!pWnGF3%&QaAyOz*Q;5Xon{)qYtW)&UO
z_{#mI6w1LV{xCk2SenvXEI4ig;@g1!O$B=SFxwD>aWC%-w#?+zG1}2|_Dh>3W!ZKM
zKIgh2uo6$G@`u0xXek>{m@ol^fGGp!Fh)d_DXd}+DmQ%)N<euidU~Ba_70pdp$)B=
zYUW!+gf$?vV5%S%A4nRF4ps2iBbe=Q@mRaLHKoUN302O>w52l$o}x&ktdZ?3AETNp
zV$i0I(y95=Z_`4N@TxO2HQUH-;zUCMJ7KqCQY1QXT++#;!Vb4LHWscv4bkP&{lcDu
z_O=XWq5=1?;)jc~vl4`8A<IKvDgB5X#|vIvcLpW0IZP+tZpn-om&%>=GDJs7j<SII
z-wbUmyXEpZ4;L2|G8#!Mqz6LOG%_+0A3p#dFVy+(^OY4as<}uV-;DP3)UW(mU|CBc
zrhxR#qhexI2w=F01wuJyzvUI}PLfF&<YnELd^?iUuSg>3Kj`2AkbDV*v>*q++t#p2
zG(6;8_kCOB;gZ)<bM+BShNAWonm*TkR|5m>)Z~dBI%ND$#r7);gw|BAreYf#S_58a
z0a<{A*3*3zrvv|9=D|jJ$t$({H=tx=bO9Xp*H4V*fBlD=0#vHS|7R5`FJSZ^1L^<&
ce&M6v>7S0RH`EVatKg@<t)oq%_1G2v1A>tXZU6uP

literal 0
HcmV?d00001

diff --git a/extract-text-data/index.html b/extract-text-data/index.html
new file mode 100644
index 0000000..c25443b
--- /dev/null
+++ b/extract-text-data/index.html
@@ -0,0 +1,683 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/extract-text-data/">
+      
+      
+        <link rel="prev" href="../config-files/">
+      
+      
+        <link rel="next" href="../add-your-own-data/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Extract text data - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#download-and-text-extraction" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Extract text data
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#download-and-text-extraction" class="md-nav__link">
+    <span class="md-ellipsis">
+      Download and text extraction
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#download-and-text-extraction" class="md-nav__link">
+    <span class="md-ellipsis">
+      Download and text extraction
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+  <h1>Extract text data</h1>
+
+<h3 id="download-and-text-extraction">Download and text extraction</h3>
+<p>To download and extract the plain text of one or more datasets, run the following command:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>llm-datasets<span class="w"> </span>extract_text<span class="w"> </span><span class="nv">$DATASET_ID</span><span class="w"> </span><span class="nv">$OUTPUT_DIR</span>
+</span></code></pre></div>
+<p>By default, output is saved as JSONL files. To change the output format, you can use the <code>--output_format</code> argument as below:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-1-1"><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a>llm-datasets<span class="w"> </span>extract_text<span class="w"> </span><span class="nv">$DATASET_ID</span><span class="w"> </span><span class="nv">$OUTPUT_DIR</span><span class="w"> </span>--output_format<span class="w"> </span>parquet<span class="w">  </span>--output_compression<span class="w"> </span>zstd
+</span></code></pre></div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/getting-started/index.html b/getting-started/index.html
new file mode 100644
index 0000000..9296892
--- /dev/null
+++ b/getting-started/index.html
@@ -0,0 +1,805 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/getting-started/">
+      
+      
+        <link rel="prev" href="..">
+      
+      
+        <link rel="next" href="../overview/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Getting started - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#getting-started" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Getting started
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#installation" class="md-nav__link">
+    <span class="md-ellipsis">
+      Installation
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#quick-start" class="md-nav__link">
+    <span class="md-ellipsis">
+      Quick start
+    </span>
+  </a>
+  
+    <nav class="md-nav" aria-label="Quick start">
+      <ul class="md-nav__list">
+        
+          <li class="md-nav__item">
+  <a href="#download-and-text-extraction" class="md-nav__link">
+    <span class="md-ellipsis">
+      Download and text extraction
+    </span>
+  </a>
+  
+</li>
+        
+          <li class="md-nav__item">
+  <a href="#available-datasets" class="md-nav__link">
+    <span class="md-ellipsis">
+      Available datasets
+    </span>
+  </a>
+  
+</li>
+        
+          <li class="md-nav__item">
+  <a href="#pipeline-commands" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pipeline commands
+    </span>
+  </a>
+  
+</li>
+        
+      </ul>
+    </nav>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#installation" class="md-nav__link">
+    <span class="md-ellipsis">
+      Installation
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#quick-start" class="md-nav__link">
+    <span class="md-ellipsis">
+      Quick start
+    </span>
+  </a>
+  
+    <nav class="md-nav" aria-label="Quick start">
+      <ul class="md-nav__list">
+        
+          <li class="md-nav__item">
+  <a href="#download-and-text-extraction" class="md-nav__link">
+    <span class="md-ellipsis">
+      Download and text extraction
+    </span>
+  </a>
+  
+</li>
+        
+          <li class="md-nav__item">
+  <a href="#available-datasets" class="md-nav__link">
+    <span class="md-ellipsis">
+      Available datasets
+    </span>
+  </a>
+  
+</li>
+        
+          <li class="md-nav__item">
+  <a href="#pipeline-commands" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pipeline commands
+    </span>
+  </a>
+  
+</li>
+        
+      </ul>
+    </nav>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="getting-started">Getting started</h1>
+<h2 id="installation">Installation</h2>
+<p>Install the <code>llm-datasets</code> package with <a href="https://pypi.org/project/llm-datasets/">pip</a>:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-0-1"><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>pip<span class="w"> </span>install<span class="w"> </span>llm-datasets
+</span></code></pre></div>
+<p>In order to keep the package minimal by default, <code>llm-datasets</code> comes with optional dependencies useful for some use cases.
+For example, if you want to have the text extraction for all available datasets, run:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-1-1"><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a>pip<span class="w"> </span>install<span class="w"> </span>llm-datasets<span class="o">[</span>datasets<span class="o">]</span>
+</span></code></pre></div>
+<h2 id="quick-start">Quick start</h2>
+<h3 id="download-and-text-extraction">Download and text extraction</h3>
+<p>To download and extract the plain text of one or more datasets, run the following command:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-2-1"><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a>llm-datasets<span class="w"> </span>extract_text<span class="w"> </span><span class="nv">$DATASET_ID</span><span class="w"> </span><span class="nv">$OUTPUT_DIR</span>
+</span></code></pre></div>
+<p>By default, output is saved as JSONL files. To change the output format, you can use the <code>--output_format</code> argument as below:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-3-1"><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a>llm-datasets<span class="w"> </span>extract_text<span class="w"> </span><span class="nv">$DATASET_ID</span><span class="w"> </span><span class="nv">$OUTPUT_DIR</span><span class="w"> </span>--output_format<span class="w"> </span>parquet<span class="w">  </span>--output_compression<span class="w"> </span>zstd
+</span></code></pre></div>
+<h3 id="available-datasets">Available datasets</h3>
+<p>A list or table of all available datasets can be printed with the following command:</p>
+<div class="language-bash highlight"><pre><span></span><code><span id="__span-4-1"><a id="__codelineno-4-1" name="__codelineno-4-1" href="#__codelineno-4-1"></a>llm-datasets<span class="w"> </span>print_stats<span class="w"> </span>--print_output<span class="w"> </span>md
+</span></code></pre></div>
+<h3 id="pipeline-commands">Pipeline commands</h3>
+<div class="language-text highlight"><pre><span></span><code><span id="__span-5-1"><a id="__codelineno-5-1" name="__codelineno-5-1" href="#__codelineno-5-1"></a>usage: llm-datasets &lt;command&gt; [&lt;args&gt;]
+</span><span id="__span-5-2"><a id="__codelineno-5-2" name="__codelineno-5-2" href="#__codelineno-5-2"></a>
+</span><span id="__span-5-3"><a id="__codelineno-5-3" name="__codelineno-5-3" href="#__codelineno-5-3"></a>positional arguments:
+</span><span id="__span-5-4"><a id="__codelineno-5-4" name="__codelineno-5-4" href="#__codelineno-5-4"></a>  {chunkify,collect_metrics,compose,convert_parquet_to_jsonl,extract_text,hf_upload,print_stats,shuffle,train_tokenizer}
+</span><span id="__span-5-5"><a id="__codelineno-5-5" name="__codelineno-5-5" href="#__codelineno-5-5"></a>                        llm-datasets command helpers
+</span><span id="__span-5-6"><a id="__codelineno-5-6" name="__codelineno-5-6" href="#__codelineno-5-6"></a>    chunkify            Split the individual datasets into equally-sized file chunks (based on bytes or rows)
+</span><span id="__span-5-7"><a id="__codelineno-5-7" name="__codelineno-5-7" href="#__codelineno-5-7"></a>    collect_metrics     Collect metrics (token count etc.) from extracted texts
+</span><span id="__span-5-8"><a id="__codelineno-5-8" name="__codelineno-5-8" href="#__codelineno-5-8"></a>    compose             Compose the final train/validation set based on the individual datasets
+</span><span id="__span-5-9"><a id="__codelineno-5-9" name="__codelineno-5-9" href="#__codelineno-5-9"></a>    convert_parquet_to_jsonl
+</span><span id="__span-5-10"><a id="__codelineno-5-10" name="__codelineno-5-10" href="#__codelineno-5-10"></a>                        Convert Parquet files to JSONL
+</span><span id="__span-5-11"><a id="__codelineno-5-11" name="__codelineno-5-11" href="#__codelineno-5-11"></a>    extract_text        Extract text from raw datasets
+</span><span id="__span-5-12"><a id="__codelineno-5-12" name="__codelineno-5-12" href="#__codelineno-5-12"></a>    hf_upload           Upload files or directories to Huggingface Hub.
+</span><span id="__span-5-13"><a id="__codelineno-5-13" name="__codelineno-5-13" href="#__codelineno-5-13"></a>    print_stats         Print dataset statistics as CSV, Markdown, ...
+</span><span id="__span-5-14"><a id="__codelineno-5-14" name="__codelineno-5-14" href="#__codelineno-5-14"></a>    shuffle             Shuffle the individual datasets on the file-chunk level (no global shuffle!)
+</span><span id="__span-5-15"><a id="__codelineno-5-15" name="__codelineno-5-15" href="#__codelineno-5-15"></a>    train_tokenizer     Train a tokenizer (only: sentencepiece supported)
+</span><span id="__span-5-16"><a id="__codelineno-5-16" name="__codelineno-5-16" href="#__codelineno-5-16"></a>
+</span><span id="__span-5-17"><a id="__codelineno-5-17" name="__codelineno-5-17" href="#__codelineno-5-17"></a>options:
+</span><span id="__span-5-18"><a id="__codelineno-5-18" name="__codelineno-5-18" href="#__codelineno-5-18"></a>  -h, --help            show this help message and exit
+</span></code></pre></div>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/images/A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png b/images/A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png
new file mode 100644
index 0000000000000000000000000000000000000000..fbe900dae8abedbce69502677693ef0437334074
GIT binary patch
literal 173165
zcmeEuRaacm(rx1ef+e_1a1ZVT5AN>n1Z!xZ(ctdx?%ud0KyY^_xV!5m=iEES_YdyN
zH}>0p+N)%)S+lBksG__i3KBjN002PwE+wW606?Vx0MMq0@b7OJ0=UWmPyoPpv2UvG
z`o|pzzGeqXs|L)$W77gWCnLM<aq4YI<yazkF^R&4AE93j^S)Cpz^0qBKmu7&4ZCqE
zx_xVe5&aaSXsS4EwnpSp1*UmApI^IF*4;d{!vTSd-=7+<Zwm>II!m3)4`*oWuag;K
z{`dJ`3;eGI{?`KkpDpkM$QZ-dmM){?sjag?bhl=Ea~N)C>aI{2v#c|W1Dthwu-oxA
z%2_{2T<WlI(->mB32T{NbvhMmn-E&0#gcE}t$bOP*ir$Gp0;Jp-O5mBGyDkEB83{E
z1b!ReIhYZ2DOsb#?&LKX0n4={djeY&+kh>~jp{9y>SO6xkNn(u#L*dRSI%1$TeR~>
zI-TqDGW0i|>PHQg>kh;Qm5afA9cwy#eEo0hE&Rz1)uw)?e)SH1uq+nqMai$&LSEa!
zLG-g+*sTv14C>NzJF)aobcRw@2~ET~arE1j)do1(56<0*0G10hFP<xejcr&721RR#
zUCnAnHe=&kPS@0ZG;lPkC%IE;8jlKD*8^J$;C!GrU3?r7Mj~)-*Hh@1ZBX{{%o6J=
ziT-mMdg3;cxUiT}s3Ppl55>REYkODE2pi~0=q0c@T6g6z8k@=5(QqDKi)Jvti{z&{
zD;REz4g=Tg^v{y*#(Q&%kGH3S;^E@p##-S%b!WK7>uVE?bZxy}{EFOiv&F}~uS%@y
zaH`WL6QR|CGCY7G{0$?3%t6Qv%i=+{v47)EoA5!X-n$z`*(keTLOR*?K92w+q=vfR
zqJwaGzsa_}!&;?q>})~$sW{Y9USC^X-Xfmihn4obHwr;|T^03Z?KW>~Ljx0B39YNu
z#K%}!T%vMjjmov&wWeTYLv<+KpQK`}sKg*29A8TR5WxW@FN_ob7C*hn2c@X4@ffN(
z(`!FyTtDUY9{eO|TQ31WJOk_{JNl+%`KsMVhdTPE=b>cRIIEq1UMgOJ{$QBp^w~g+
z?>-$yNS`@rU6Uc9%AiuMv@)U60E%BV1A)+qZU%;&xE4SM5~uN6W4!e9Apuaq(qtDe
z#04@DpO}*N<b`Pys<e8Iy{a}G)Q~A8-~2^FAEX(DKgV=evS!w1Yr6OIu0B@*uFgdp
zh+)eIr!ky2)CohU6wmIwddlAtMR#?om%bTS^SZdU>DKAvIb_v#hZR&^LSy@d2m=6$
zOJRG^;)#Me(>ISY?uY^H#9L&cf?)@BxV7$Lo6+Zbx}R)~+V~T&Kl_4Y3wBES-M{36
zud;O|DT!Do)a`zN=FFvcd~CREUjkk5{&|*(ANh-K%X!rx4`1Gt4)KEOI`?gnZuR+S
zc!0?q1=3)02zCOzigXxF;qXx>;Lo&<2Q0cGdpy=H96YLsR-_pUN1aOOWJCbU@TX8j
z%Q_MR)b6Ra&afxDK(_6ng{}a#=r{LC1K%EtRVD+t?js<ty2IG#Z#RMywwP~~tF1fv
zdn4{vl7rBL&~GL0G4`BK)yj^uaG!gST~P(`F)q@^)2+m=&|uhV2<LXq0LQX?v!e$s
z7*UGTmiY;b-pXnw7Zc+#Y$7h0=&N9OS13e^r=`3#|3vN-zIXodDWzgo`@RPKzzh&9
zNl%1ztG12PZ1veV`9kr2YSspf3rsn4tz6e_MCmX~w@>g{%IbQ-VIHMV8l%(N*sOnd
zOZZtDSWA-sIx4@q6z5n&11WJ>eOT=%236&DI6qdihD#y*1Imwc>!g`Iy8ZI5KA~iX
z=;0@Xzie+aDwcmO;B>hkX#_tYV&a*XtpZ5pb>K;%uVkwmRY~D#(gsUWw;J*MWI_xP
zb?&RSNDB*}^s<(h&?jX*`_whs91eW?vZ~+eA`;Z)v8AxN>Hnp=`iT2qhl0l^G*s*$
z2_GSYk`8m%9N9x;iIgWxW%)owgM-0PWHISn&FY4E{BA@^06z1%#l~heEID*Z%(tt<
z@7S|q^*=qNdvhyJ$z>-j!c0p#ex`ktp_?rlzp-qDDM56kxu<U(q>WRWG)PxDl;v~w
zn)GQ^_nXqyvRcuwnK9sh;2<bwex{F)V$s@|YJ9hm%-vE|rq%-*3FuoP`}xzTU+wA%
z{s>ELW0*9ER#~Z1^aiu=H*~iqD<DFqtv5KMOG?i;67)+?xCYgbP@5kM5+#5g-7WJ~
z%hBKnQ8wW<tk2s{la|dQ)@p0jpKa!`gY#ZgTb49$6sYPm>4^A)`=`903Jo-UuK?XD
z`e4ErmFbLnYv<GMf!_n+(cUj}FbV-O8S<=J8~zRNUx$2Im(dYW)?v2Zgp$nle6~1*
z?iz0{<p^>E7rdI=-JsH3_(h`*!^RB85+l>HgNgm9%j->DOrRwH5IdJ$JhsYklQ19C
z0|p;;Vac;N!-wNehniHa4*>XNWrNdRS_L%*RB5GL_?*ISU9RbVW^<xNf2<+eCEEFl
zbwN>m6s4TMP)1YBE8{(*67IYn+uS(Evc#v8;@)$wQxSO}v7k$420Gv8aiQs>D*f9N
zCT)$pTOSb*?7AZ99n~KI4G+>SDk>5t|51T<7&^s%C=Y3~WnsBGyiH%!3jm$9&JmcJ
zMrU#;o3m+1t~wdC&{g~gme_*`=@s6WJ1d6ns4+)~eznhHR(}}6IKDXRTn}8FVO?&o
zF(trGwv*jhu%04UsS&4N_Np{5wlM*w^j}OGrQ_q*z+F$`2W}>%PX3`dzNnMA{!~0I
zjoqBX7(<?d{CMegfY7uvj@EA3tn2dz^^;<`ro*Yw!!m`TR*v`bsvv3Ra6v_TsP>B(
z!(qzV_a<f49U6qshpb_SA_aeG%Unm}7Nbk5&+5A=v9Cpc<h}i5*T`m1hDqFt0#Kn9
z`(ongy?Z_OgF(0|XDf?EQ>LK(INyDVfXQz2i6*z*?4lNdWrJ;q$#AV<(;_9^@bXtc
zt2ROH!E+7~30|Ej0oS5mEaUC{zB(T_x2cAJ`(gjvBaXpkcBhL?5ywB&>%@FVy$9tA
zpkFoH>m1Cwrp+=5@!4F3kfNhivKO%_nX+S$0|~Y|Lxrb+co#GRZeh>)^C4J?NaCWh
zc=*RYlQEv~A6AOPaI>OwlDOg$W7SV8rusUQ+53I3S~PP<9m&e78jdxaY{EqiHI^C*
z;|rORx69$t{MPftVE+=cIe4Bbkv5ac9W(k|LPg<99)*Hw>dYe7HtT!SHpU6ZTwc?9
z(Ji2*OSnLtO>k!2?|}5vZ17ri=T<*9^SMkiDznx`+`nr>=FrJ$@9hosMWT4}<#fks
zwSrUliwJYLhtBr$EUm)i3i9K1qZAy^u=|eQnM`R7B8zk=s3E^Zg6J`IO%pcFHT7#M
zOZ>JQ;DhC7c4n+bMAHPiD`M=ZZyQDPYgc(8@DSX7_DS0X=i;K{vGEl%1IK-%Hrv-`
z6cy(|j`XbDt5LcssPFuTmqgCGwmpTfjdIr9yeY(nt-2)(BLgLJcFV@`!5ZGDHm|pD
z&&QO&mg$2p$hs5%pzNDL7W3%_{OiV~&^|W&t5&{x6F)<3)_(sGPbF*TPm+ReK!pAT
zIQ9;WriSY5w~aV+To{I4OGQbbct^a!M%}6F?zX@p_S#PkrIo3sY8G)r@`AzGSL1ar
zDo#z3njgMaOv@W1-`nE*kM2(DafrP}IL8QUJ<xt!SO%}w9WPx>A>?nNa%6n${$fB;
z)d&ba1ECa0xDXS)%|fLU6VwbYu5j@mtu?(2;#kZ!xYP;PbthZDGdY8@9j2O_pf}5P
z9~+3%TOJ3ClypP`Zr|C&a>$lz?T{w%Aiyk>STiRnP^2r5K%DU~@Brp3xNt-@U=Sb{
z8&N!y*L8;jU81<*#>m=_3l;VoNm++(EnHvrXKZl@v*AGakDQ`G`C<4y>XRUFc4M_r
z&A7e*sF$iJ8YCnWnvn9S!ws|Kx{&4D14aK8lm9Cc;J_3b3N<!7AVT>^yE$(WR><a(
z?};k-zSp?;cE#q@r#~0@__DC)J9hII#u&ivr{)Tnx4qnK#$+3zdd)H@RI{UI?QLE#
z%S{#co<^3?*Is_~i^C4S^X^k+W@!L!v@yT9#7mzIGc-yd4KW*Rd7r_!_L~gDGeT=!
zUUE%|1ch5iZk$5;aino)*y9uyflfw~#Ta2Dj-W?1c-HR)c=Mapa&VYs&`JIJfyEj!
z!`5q7_`5EIW_!&zzew49>g#lB@0QDN0_kyRf^G)1NKm0>y?Wzy?7`ZEOqIvJLxQY~
z&p(Qwko!&%_yhk4;G`43W+HU8YBF@0R0$C#MyPR9TbGpNFyrAKfcj1!7uGj_Ct#2f
z!eodL_G<h@T<bVneP*Tn!`c3K3Q7X*ttJD3$pedkK~&Fxhw-&;pWGw@w5?j8I9^mz
zex2sB!pykfTqM6JbGK^4oV8-1k-;xAY9=3uhjeed=xGi>adj8bIxP=7JA-4e$iqb-
z7AlAd;+sCgdvp`|_$6jk$E4Z*ME5|UpZMXLV65-v`;!#5JFm_=uNC*YL3n)<Xi`H8
zyc@xm48j7s<Fu!!#lGwqCqN_k!Hv|kbIHQ1{$M!|)~%k~5f#U+3!c)bn1(S-(7(D-
zT|?iXkydJzJD`cC@`ut>T70tgsL8ofuU7LqG^-Y!rZ}hsTFkbV>m4yRFf=XIKm+Ov
zSuLu0TgE#Nx&t6jwVt*GEvB>l6)e?@h<YK~IUS+OC$pZjHmW|3{=)ZbgD`lZ-O*o$
z;ewo9W-W*k*J|))O_xlzAXEFV$A4h%URT7u9;S1tk#*1irSLC<ym24jKrZkh-j4L^
zYUEI_SQa@@q4dikfmypH!dwddN}H4!=d$=@a5sg$!p5dtj2I6R@A-1oU7)VFL6lx8
zt2N65FX*d2vMzt!1&svUO*7G8`-C3}K1U$ujMjorta%cW6@+|>k;j^p@((~2uQ9`6
zud;rkVQ%|7f%~<6*rVGr7m0T-P=VN7dk0AqcflG&P4419QP(^h7on6EF(B^r24xCh
z_+TKBGotLIe~hya+OnX*zR^`xAt(}5qhGq*jpP(sZn59{HO}u9`wBZ{^xxxUKI1%7
zKDHsGyxnE5Bvq@vRq!=CiUiuu62mgl`8Pq~7Y1HZhCS0wA%B)ijdi^|j!oWHOAZlt
ztfT;WA#qg&omFYQ>OLmtU{*!|#AIWkaN{CBMDYUrrZlt_`X?z+cKbYuSJjhM6QU3n
zlYu>Sl|hXJ(82y{Wt7W3e^<G+8;AFbDt}y;U}@k~E{IOTp$~^UGZ@lHu}C!RR<NpE
z8P3h>sX9Ai0fgevfA!7tM#}pAF?a%9fMp7klV(Y=T^`NES!mUO7_S6U!CsD<@8Q?y
zm?^kN<GC}HtqIyPYhe;#QYKVq&A5);ZvihTw>T{>YkA0{L~Z0P1^PqHUc8(d9G&~6
zXtk@|meXM4`A+<r+{>ZfE4fUzni!!FT%<-iC}Sd50|*!8w+6l6?(8D^jAI1IY<daB
z@n4-VRLf)O2ukKEvK(|$?Vnd-!JJ;tj?2J9-&J<c6-a|d<wv#D^TA&%#UbLyZ2r_5
zBr#vQ_BLHcR8^(g9(WMFzShiuB#iy$12H?eLErlE{s`D467CsqXxcYgMAkme6D>M!
z-DXOXaK9h%;}qHdh)Qar4WITWi=o87d`qn=4inVSCKc;b5edM3@Z_q{*|M_qA0^L%
zIM>->s9`@(X<5DVDY7WHyRkVD-a6$5zgQ@GG_?xld=q`SRBiq8%>o^<kC8#WG|sSc
z<#SN5yuPuw>C@d`lS<DGaRC?~8qt*PANnY5DjBp2YYlY>!hwBL-`kA-Hn09L&bBL7
z@yTfsG5EMgD&2^4kT{0an3MsZxufXIDcraB>nHh}_Lr?dumw(UnsSJ_h|l;V4Lks|
zoIw&Lgodsuqbc_^gCli!5UI<n_T}T$#e2>F5nGBQCHya4pGn2bGj$bRBnp3P48cWO
z6a8nK-Wp{Ux?WF4@|D@)wns$M$;b|PjA^Yf-3e+Pf~7(c!5>OY8Wlk>OpxTtmR7@_
zolcZH0!T|*eWWNsg&?<Kh_9B+LET}qTy=w8EuTr@ueedghHPpQS0qxaYR*c?RL$iS
z5}apuuwWY0A-^Y|4Dez5GjJ>P9a}a8m%oB$IP0?=51BZMCHptqg~@CVBg%$RNruw;
z{5$*U^x39Dp!}pl^xOTWzlnEn6H5fP=LDr&5tK=WW^6FD;K!)d1?GV*w|<Xk=!yvi
zxReY+<3z$K!#Ypu($)J-<{G5wsr}UkK;_(9{_Wm9q<}s!Y2H?7IbxQo4j=bK6xpTu
zA#>ZgzAK;<KK}7D2**MPHsz9?1ENG{&qW!zUOOp048&4D>C>>QvuToFK<9|>D-_qU
z9$7+K_S#(d%*kK(OI-$wpd|4abEEceogVQ1=VGlW@|;#yEi~yg1(^EVi{FlwvQhCd
zi28QfOp>@Zlir!Lxla19GWZgiwk3|7G8S2+-dyVByAb2V>a%m<9yBelasphMNIZHo
zH+WIS%Tb%E+6)8Bc!vJahOZItxBmoWb@Qr<aW+{0yt_Lol8Y&uAB^#WIhFhFX-sS>
zj1h6i)aJjh7ib+c8cZEsDtKEViH~_zIbUHp5{TuOrX};flp|IfL!OMW&RS+QB>7|I
zf3^IHb2K6vtxFCmbm=a!!hGP*E}#D90LkaJTfaY6@?2$W(mcEV*7jsV(ik{v*0y(n
zQ28zPIjr2F_OR##qKIFoH!n0zEaW=Wdeh2Sk9Lqgi}%dY_~v_~W#KPnK0g_J5|l4_
zJQN>nYk5FZT8Bk2p~xy+#~554Ym^ZI5U1sY@|Cg@bkZvROG;sjm|ar!>9Y|TIYuXU
zIvF}zSOL6QTF?+zyK?uY^NcB%GN>EhoGv88PI7Twy6W-KW(nB`82gMlW%O<W_J1bm
zb1EI`GqBwWE(x$%qA>iGA53wE#J}yUcO1*-vb!*h()Ka@kWPPv+I9PP9GSn;YTyvc
z4zAVKk&bJ7Ju&jmQaI;|fEAx=HFt|C$xFFV6E!Mv1KP?BUU>w{Qc|sw=f0kg)y-o_
z$NABkA%RY@o%sVMQoo5h9x=43x-FsfWzdPZPODEB%9-cyDsJF_L=;gyYuE!%P+%$N
z3`LrgIJ-rrBNN>mGKK2ZMaY+S9AzZKjMU?NI1$kj7+F|U$I;(=#vMwPq3yovy3D>Z
zQTWQ1Ei*P}jYf-c5CaCmY_1Q6Oi4b-2i(l~|Ku24rdc7MJ)33v%)KSR7`pHQXy*4Z
z<}BGtOBgM-iWs*%)?CMjDV~1mTqrTjS2f&mWbK9sd~71C=IXtw&YV2>$f(ta0N@Qu
z0K{|J7ra4n28mvAtp;CajW67R-`o~&=Jjpf&c@fwD>v)(Aa=pvXFZ1|3m|sMI{HIG
zE5xzEyfD;g`L2ni@r14Tc+r`<!uk9g0k44B?2cN20rUoK9J)x%KJt>qqYn*1;46;L
z=K;rBk4;|@cxGS9$zX+c)r76ZhKXW>iz21#et>*ozpo}jLKc?}uQgBN5*d!mCC0>Y
z`RswNE9>*eDb{yC+y3Jd7`nA0h4*KVeY}%!$~T|F3;M;Z7#-}dd2YUmh=09}Q&fuO
zKQf#*WRmllImV>La}%?OT7AYeBt;e`!uAZ8e<J2c%EcD#w0Fllayxc7>smh*a-7k>
z<zQ~qv2R}TzO?Z9Ma%~pO41Q%Gv32~3e%~oz3aTHyk!(RuRh*C2tvsZ;N)vG=Hont
zZ07a=j{`^=^oC{BtwYo+r073&)POWlz4Zz<RV{LbVR4dIqUR+@=;?ALr`4d^o2zNW
zUIgimuQHWQ<;ZKu_o^>DyGwR%Xz~&w;P*^@V+4<Cd<w^&(VZSDG5nbP4Mcf+P5r|A
z-&IGZYnvNJhc_q*7z2%hfz=afCOF=@gd?onlG0DU`&X&t>Fq&3lva|hYdgn}JUi*n
zGp+^@VwA3P);<k3*ANYE$PiSlYC`0l=dx~&^&p;C=nb1H!21Cu8y3Lv4zpL*%aG5T
zoTr!dQkr!ALy0ku71mJt?Ju{t##N`~;gi@Wk~$rSwt!Zo16FHhzH*n*Y{8QyOX#2p
zgwh0!2yGW!VvQ0?>KUC3wMrbi1^Pa$O}4zuiT3eS_ot}Nacg!EK(aN1aof%uu<`fe
zgK&+_@P4wImK)4^wJ<ND5HNeIg(?xo2(L)TYQhp$?mtZ4V*dTyV^sp<hjUH_Pt-Tw
zrFSZzZ8;B@U-Y~e-Z-vf@t226XE7@sQzn8evSU+$B%-D&q)rEJhR(lI4jpmMB4ee=
zTNWtoGtR;))w<0RUz4&T?eH3O>|6ZroYkShRGbl7ZjRlDh2rU%seaS|>WJ%=UFlVQ
z(6B3TxC-it2F#HCaNmEN+5|3?vn8(G*@0_w8n;jHx^yZ(>FPfoV6_0!7E+p-Xk6e7
z^zwdZ4WiW=l5yy%8IAcpm0dG4t;f#=z#dyFwBeO54qX5^(S|L1?h$c%yr^+PUg(ap
zGfWcX#v#JxYj+xFHL-&E`PtmIU|)qK<%IkbnLXRw`~{j9O5`ud8Fud(&$Emz5WY+i
zdZiK*&o$_GOS|u>9YGG#7!9v!d91|Bi;u#oY!mxxt$YVTJ6%JDP8&n=1q=X~PQP&k
zi_@|}C?;q@Y!siE;ED|7ZJTuTuL;u^er^lBFqWBR@H-A+@9k-sPBNv_EJ1~|I~_I|
zIJbar>Z_XiPoDq>H5h!Cz5ex`Y3205{cWXXmeyY$6clqS+(tWot*+|Wh3>lm8uetw
zKR1yAC`D4eCP+T3aB`awwT%^tf*Jx6<ji4g*xadj!sUbB`uEIT^F?^I&)rW#w&Sl^
zXSGXL_Vu%gf+(2{C$_kaa~GnJhq_7#D90SZ=PcUxu)}AVl&zO@QGQy*#!d$=Y<$y9
zCSCen?hN^d2Y#=Zm({O5%2oRJW4fS))`*YTmfKKp`A4gTN3(qj*W}#?w5~;MU<bwR
zx3XdOmObj<wCJHUr3DAEWqK|N7E@lh`y+O(x6U&%Wy{P>e~B~}is;>zMpgth4Fw2@
zDi(H$+D>2vnEMsRmTu(&AZ|r=#k)2Ti@l$#&UY}7H^2T0URLV>0!}<Lyk&?Pfm{j&
zq8+v!mY5~;h^GsCn6a4#m=6#(1K=$fK+=c#fD(tNv6i3wqv4w;o5WZDek#NuEZfAJ
zs7bE+n{s`lCa<HW<s_VSD;;$m$cfEcsao}l$qW8>*5*0T?|9Cc1E5ix-TJNe5tPWX
zBEBV>ZA~FsS3eS)GYv+~$7S79*My7z*1;Bw|FWm<+xXn>W8aj_!Bffr8D=Vla%{gE
zo0(riQe%?4T%J^`QS(51!Og5;$v_{1vsI49^xi!E;<oFR@vLQk)9Xga;zCLyr+v+D
zNaWmd%P`#*f$g5KANy2YBhKu-W|JCC-@MS47SQRCJ9gM#;EtzB+iw47$JuAJ#ZGXv
zriu4lmg9~6GI(|8wmX=(nALkiJt_6#_r*sCs!E7s{SBwACJNw?Fdc~l_uF@MlM9N^
zIVH!$YnC6NB0h(x87mh>&oct7!~s25j#+nJPeM&Nuh-+6K0Vwii)7OHb&&*IVJO&-
z-XYRMYTIV_RjNUYRDaVh$~YV}>uk89m-S@&kFPz2g8Y5+@(3|xEiP^Kni?|yb#mDL
z8#?&XFgx=~A_!9WGrFn;1}Hhd^Mi|UE`rA!-I!0L2N+HQ+BM}nC`U{NQKGfHO=qMj
z2pEMRp$)UmuED{T{a%l+Qlq>*vR#A~cy(`_`gThL*W`VUyvI%Ts3X<OS)g$6H3bP|
z{b~AcagD^iBa^#fylP3iwy%`7ZR<f}adh6mq3gnD-E27IR{}kU4Z~zNqF&4Q-dR6e
z#nqZ$VF98v(-}ZWGVIrz;*`SAX}ZP=^390y<l&*U=%cW)``CLFOBaM^P3@alxBev0
zMmj^uC#PMYYB^#7({zXf#A4+Fddkg=pdz7CJYa7UCfLqnm%W;Rle`aVu4D6$9;d?6
zk7<pr->BeY`(c%P1H&gabIkB5gMT{KBII2<thbx2Z|Lx-y~naE{xY4h-~Cn8;yG@(
z<odg3DM5#Df^v~jt(<^F<4<4*Joj<K=5A%udPCo1PkM-UNwuQSKFVyRMtKWeUHDHm
zqjV_vLh1R#gi!=5V?q*!<-;O5;F}-C?WPT6@jmEVIxYdECm{d!7R_s^^Rb0%aW%Vh
z_|$!H6oP}7+9nliN6DDzpsr}lwD7TmIlajiiX((P2v9vqmaeP&ZTzFFMfVT!Iu`wN
zwAfJLUdg$VXv(ShL?)edjxCOKVT-v-dOqOdu+K)YU9E0i_G9wfZ&aa6MA;V96TbS}
z9--IjV|x83Z9eyn1R@@<6^X-qh{>LrI!T!}PVVS<`<(!AdjJfi_~MKGl>-^t0rGvn
zPPqN5Jok6}>pzu#tms{I%8;|kDzroC*mUbpmkCd%07g6OLYO+fY!|dE=%K2~)_Gej
z;u)j&RY(>nFSs^hxhVxG7AO<c02}RJctthJ8aUMVH7w3G6#P>@d3`ZHuu_jeYlk(w
zZ-=w2#FkMnCg|#5j}ZAPei0vhlHGBUZi_+t@^t#taI)4sYY&=3w%ro@6(ujf99wFZ
zu0^-d^!grH1pWOdGb`<p5z#erGf9)o^tR&G=1=Mc7d%q6O=98j%iX;(NX|e_!^jV+
z2@EP10@a;_;yh#Q!yuzKnsF}h6_=*x6V=nn+Y143nytDq`@WQ<sqcW+^XTk3N#MD(
zt5L3CD@5WrY5A;SaxzlvyKUgLk>gf#$2~#k*xFlvmqoVsI>O12Js#Fh#;7Ms(K)h^
zS!H0o%fXcSKf}%aYpVy|JJTc^%uvnF77E+*>;BIA4T?^a?rbrH^H68c?#yR;9<HjH
z;^5Nqxx%(lLC6Kd<fuAU%n+5WJgK0cnLnD2giYRW6Dl*kd=eIy^pbIU6XQe{!$x+Y
zikiRTYy~Z#5>w&h5!D9nn0XG;>yPp$XMp$P?-q+sJx74{1Y)@Hb{Z1LiPaC<p1bw-
zO+uh$zT+Z`^*=Pm=4~|M{TFl0%QnEL$~jVFP4M*xeB#;Q&mIQ4Ye5>uk+iP6uF0Aw
zI9sw)kNQ4;Z!#W@fM>OKZv69t0mMu7SvMc~K=&yx8+Xq!nWTDd&-YNClWg{oYF`QP
zXKNTWWn-Qg*?QZA0F}Zr<bA;>tm=lYXOb?v%FoocLRReZ8`0jQ_ztb;KF|j#?|oRf
zf5ab9ZQK?2G>1CrcgS?;;1_UwT+7s$Z0|gF)9Y~)slMWwnONPqwzMV2Xl?z)P>IbX
zIhltXi$m`=_UMeQx=)P(jwwvHp!#?Aa|`c=0#k$S^pDx1HWeU|vte1eFV1QPJtLtD
zIG|%Yaa@ebJQ%!j;$xNFvZsGYqSm$pYFdgMZ=R(q*jc-qR9KjuU*DI@P*&8>p*A!7
zU^p^wdwytB!6r272StXEawIO->*><&{l)M@e_l%A$ytDX3P37?jdafZ!R6^>yq0!<
zF<p|%b{z(7`R-*S`oWDCfkR8s>l`=p87p01w{dhU0x?D!>Y>R#gH?qzkP8Bi2G6>A
zXM?vD*3G#(I<I@yU+31}m~l3C6oL~NehmK8ui)HilcR{+*M{=d#kb1ktSf?7z<!Ig
z0_Kg^91`xP;aF9mqMhpICHw%)6EzK`&xORk-`6P;l8Q?cP>0LcL{=#gG<QOnL?Q#Y
zzXbyEv6fHbh`4KQJ-O8Oe-<k_+2%)+GyScJmSbAp!W^lF8hJfvI)ga)M?%`AmZC~F
zKHsgFQ6lYV=zi0K(RS(3gwe#VK+;5+%Ae9qp%iFbeB{zc&bhWt$$PterM3lKCSVPu
zeJ>NYVmX_`87uKz&&OUcZAaf|DBzPYu>g-V>1@~YmbMKoz!T(l)_-l<|KbNVc9h5r
zsy*rSDju~G190FUR35A(7}I|$+*vdXZdlluTjl&Q%9vqrLUcQ~-UX=k<}EPwlle~-
z(QU)iT<cw-SfK~;x@^9I!oj+0tLYU+r1`U4;XYc=&~!w1gbHB7rztYh{U|U=V^Uqa
z-)MgnYRYaZlfB3BH|{kl4>ORGaT+-gRLojGB2sO7N0=qq>oh^Nw`aRN!KsrXWHsJ$
zV(xhy8+>xYPBSK#4H*XNnj<?qi?M5Te`)I}Dl)gT!XT=PB}>=JXq^1}jer2ZAuyE|
zx2K@@GGHR7N#}Jpk5MGp{v%eUJ7L-NwST4jBz67ADZrPsbSf8G%c$_yn?!RMbX*@3
zHh_3^>@bY)=&7B~$s<8k)7uE|+ts{J54L&niCVQsD)>(5jhOGdUI{0AFKxVj06-}(
z+`xK}EH1ZXTyRM&&^S}z$MJ&0+b`zlreyEwt_N?8tndG=KGy&7r%t)9W5zX?6D&0F
zrOKT^SGcK6R2nk55$4W;$zppClGrD>)8B0?gKn<LRni^sPP8pw(1xCLzx9EqOaKuc
zrqTczJh>$DNrgsbQk6eMoeiTQyy`|d7)LqKiiXl=lf?WZA91KO#+c(%jJx8g6_##^
z(ks;ij35ng5RIs`e%58sBJtCGzmRd8+ewkLPr^pN4FSY<*OS*dt(SDhg|`HaA>;xQ
z_HlHdYJDB?=g+{{7xf~@G@pQ-Yhev`lAWVd(saTjpo065!mhTbW}BAwo5QlLsHfX9
zc@8TZm(9&2tLKT0fd%tJSn`5(Jn{Dsq&va$nC$6#;2C+EvD~de-oe)xgc&e?UcOU{
zZNBf{pp*12<1)`ng@^q{gIhQm0;zW@e&7C&P(i)2oOZnZ+H(!nuHcEzFPc&$f>yoN
zCjN0?pOdR77ca<$tBnnpOh`+rA&1=)(dIfUg7m5Di+1TmiPmuCQcx_^LeI(~a~`)W
zRA*2=5q_0UBxw;m=V-Odu~fccwgA7)Y)h7a4Iw{4g?3`pD?DnK#G##~S3S-;{W!xk
z_o5$sI}GbCHmomxt64gKTFg@F7n5BJ99v@b#h^(e5B^{Vp>D_BA>jT5)Ud;{*>G|6
zGN6OIV^;iV<m<DI7p{@Chh`DB9YH|=US1(>ip5ogk@uL5fgLp&lJ0lu^E;otQS>dk
z+hd(<(KETeBJ*Nfd&fVV^mVz+#V0pJzDZ1@R2y2AQ;TN|9FfvR=L&Y_$j1;o-Y12u
z6tB^<*W&folV$H?J|S1Ae_AvpvhMYNtfv24q_!clJznO3t#3Figlo4+_q&+?pJZM%
z@f9@=10I&VSi;h(uSCZoHJT*Sj`rC7f|kIeAM0859VW^1sh$qgevdLtWs#PPAD5VB
zddQ$>Y=`eTZN5I<BSt+CZyTELuVkC%EJQ{2c=sFl#m?&LHzf9nr}KON-kUsL>V&hx
zCC5J$JT&3?(>s*d2j}}S5dg4Fp%EQvQd13+gh!}^QPZwyB<4tcSj?FHQ`NAz_?@8Y
z^7UNFW{he1RT6cnDi`IJ8_Za|yZ_{84(=m(4Yhk4;yc+l9>y+xIrg#57I<51TE>i3
z1G~>~e#`8<_>*2KTv-Zmw>0$w6v|<$FjQ;9No2l(0wVo8s?;n+h=}kYj%kVt402)O
zz&*HoHmQH`KB+?*@7KDoupP(ir_=f7-ZH(mv>7GX1d86_JozcjF9Bu1tF!VsM^cN1
z{!Khv)@M*6jGj|zf(_1|L|`SzPOTwXU%yd9SEnv?#Y*t*A$G)DNv%WdqJ1+O>-3ty
zOML`I+l0rVKUt9^Yiu!JA){e!C}Y)YmKb2UprWCX8@J*Yw6d*a^me!AmW}dp?xfp~
zn$U$QAH&JrvNzRKuiHBIHPa8u!)brsYMMcM4mC%+W-=FwPaeTL@lh$9YJl9ThRT)j
zB5qG10|MWev6BdD+YCxSx!hZ^8kYKHyADq6I684?;w5dgW20@$HS$S-iusyw{OPzE
zngho#f}DWEQXK(dKbQ<t#Y1+l+J(?cJa*r_be{jmS-|bUmV%~dC+jPW&{KeQ?*9d`
zNoZPsp71~XI_a%1w-zs_qG&5`@vBH4`V@*WP|!8K2Y;Yb{5^C__*AdzfwLPC9glcD
zz@sW90%jFY-;Wc&e%(o-p+FQgBUuUsOGLO}0oH6~xipb#0cvVTWR-b%H%pwdb)2pV
zj06(LOO+Z+9Qm$K*8+FT7XeRFCU$?X9GCAFOn`AxW>7?R<uO|h%9416tXFtVE2hQr
zvf6<zGtjZ~5=nLdkpQ~$tz-I=JPtrkVhb$+O~6+%g`N;&?2;#j5$Jq&KL~g6Lm`NL
z3Ze-!bW=J0O1JFFlMmy>kyO5*NAi>)Nx59zJoSrS(kGh66dEOaf_AC?XIiM?(ls(I
zQIWPHI!`#8rE1Kii%edMb%%8i`lld8&nzwb7bIlw<;7NSE53id<6r-Twr&Gxj_3{<
zkTsQPLF$iokY!Vv;9@@pTiv9^!rjvq#L9AQ?R?~$XWy;XvQlPUt_Okzg9WlpL>T-v
zD?kz@t%Q!vfJd@nYMzogM5mC4&NBG#53nu{L;WXzz0t7*#?dBQXS?!_{kd{G45-nT
z7E1Dzq7$JX6A=5fnqi=Qe!7ppvDZ92Z2<{oABR){xkVL-Ri<DXV!LQMEF%NF<gR0s
zj;-}hz0!mgLn@f0g4i~`JfSY^R`Qk9k1WFHVU!}e0HVtsIzsEFb29TRiUuC+mw8WV
zoAvmBQ-14>WDt#SPZa~8ds~9RHyD;pZkjI|4ayrUeE3$R<UgZT$6lcq)MKhkr+e7M
zV1e8BgFa6G1FmT6kdHs1u3VfUI2Z12JAporlH_CF<0UeD@Nhj)k>%n&zv>WQ=7AV8
zcN)*?b9+p6UVg>ZrNm6~l)<b$xJ{nksp?X2iK7^$YW~36*Xp_VlM=8?Xg=%UFBZO-
zvza!rs$g0_MO>R2_-PV_mRZ>vA)DKtKv6O)Fpporb?H_g$TQAHL&L0t6|{&ez!Waq
zRgnv(!gqlahswV7zY+pl4<%*?7Y0lGoQ&DRp-e?IPi>79tcK1aI^>XUnGKS_gI90B
zxScn70Y?iW=~N)cR~}08^0R~pL+$l}u)V^l0U~QMU_tRxf|}J&dC8zLv!h$043%RH
z|JMy`C_uRcKH-|r1o7i>KICnJHC@Q#ameS&MyTx!`iZCD-}pqyf5xAVFB$j3)}?9-
zv&tvwMe@yv|EQwXbjLrkKM-#w6Ih4e9unBx9@6UoUs&L7UKu#BM}sir@IXF6AvYoF
z+rEFj91)mXt4vj-8bORy>~nXe^V_P)ijT4`o5X59w`!)Df1Uc_sU>U+RSJWk1CEz6
zfyPc_hnDG$rMX^*)7geFlx;+`DR8_uN4^Fvs*TjQLSvojoU$+Z65s;p%vN}M*={!f
zIA<92_k4803tFK0$QJ1#2;61~yfexR`{r9<)Q?8kIDD&bgeLGDa<b6TBj~&?0I?Xy
zs`=d;B>K8u478rW2b9^%O?mT(GYSC2tk`IVl=)&%Q~$6b8;Y9dCyfc~%!$GVkpnB0
zuCqsP63I=vbslB@jVbE<yY?UqAP8hy?=|)Lo@sa%3Ohw3XnIUZ@ocA*`kJ<?hm>GY
zZgE5~Y&t+7gZq0m|K-b!sU_@R8UTJ_k5v2zG2!&{R2)auw#m#6L}+P7T4F@FAL95U
z3c%nBKCd)&cYO(?pleMZ-@+i<qoCGI?!ITq{o#OlJ#pUf(1GAv7)%pR1ZO)erQ!Fp
z>f#B9;*sW@nW0neR>Dw9OxXUANVcZJU*{h$7{T8wbR4%{S&7yb&SYPP?O<2cJ6<UL
zP5|w@7KwC;O*0@<FljyGyfm8D7%J-|s4FK79X|St>lOx-8km5Gp`xGpJW-M<T;x34
ztKb)@Uye-?LpD`<p?Sb=-~wSXT9SWPTZC)=O(7m6pSQm~kA5^?RL>FZik_Yky*sq5
zLjSpO4i#=BMxE_BO6B=?tjI2t8<o;Mi5J%CH4Tq~%qydbCGd+<Mkn+>smaRxH4~vd
z>W_nbjHLJiZ>V7A4W-S{gak<*W$1XRFV$Dq^QxhkLp3vB>J6%|X~LpeO(-RMOnRuS
z651bHZEP=RB33=mURYX&AkKHdw#Tw^e!3V6lA@$I6(pGtyn~$^_cx#}$)#vJy3EG#
z-2t!yVI4j23=)~nwP=MIxBimlw!G@LZf5YJR(;*Duma8LD+h@|{@t&wU{U%;J5F6_
zg)_Om()Z~KB0biI^7ork`cxpO@FU#U?L8K_dK)A$(FHkx&X@s+?7hEP`#uMT^e<M(
z^!ol@qU{7Xu{fnX!bPQIprfi-epAZkxjc+>VBk2eZHrpCI+9CYC^9ld7WNfm6$MjD
zqL9xJmWb1Q%B87l1Nwx>6&g1ShcuFYxY8aEbJCQ5!WwX&3AmcKnWGJP)D6-Vz9tK<
z_%rx;%Ss+W=DE=seA4}k0%A4|-?jzEOggJ}wb>%ZD?r&N>f;X(XNFVOCi;USB`DXe
z804XsljX#C$?WiQCDimp><*||ba=WGvQEc^I)7fahuc7!$Vi_n`qiLiyY$sH`%^)1
zyZP#UEHtjIf+xAou5oIZ_N8><FPfu4Vvt{|HODvN{>BW-%FlEDcZwPJWAotteTSFD
zx3snx?@tGs?>d0_KOI1^YE9$yad*@5CKa5bq=(Wxug5V(lNKX#CU907nPa?Sy=VIt
zOWAgJ8gLtDXQJqzxc0m6D~SYcUDcGNb>|mlr823eV-WAYb45C}K+!;N_@lPOvf2a%
z`oLuHluClw_BK-*q7*I1+)3@3m%><vN%ta7sa-|4>=6smWh()56ApOiN<<AtGOIW!
zA2X#VY|3s5^K;EbhMSY0hf>P#JdZeweMJJ_$wss3X7E1DSyMnwAE#_Yz8{cJa<V?!
z=U()-LdU)qKQBvLN7ZaJSPQ!wy{}}iua6I|SKHs4sPGLlC3JEQx`5*}2Qlx1p$j7U
zS8f0(P3|MR1iekcw{@y}?sxU*(t(_)U;N(!1<nv9Lvl7Px+2E<LGNjLD0=~Kb&1$_
z(!6N?B_<6$@5GM#u}sCl=&c!xm0FxfUqp;huFuxqVY`+4z%{)t4{_CC4DWN|v!m;8
zP{{Igw8KnZc$On^S&ofnU~Jkh9>336*MeW5Ckt$9mda{coQ|8;jd1b%7q&c;%P-i|
z^Js`*_Hta(yTvv>MOWv%NS|x&{kb|9p0)2fNo9JO%6Yi_3ZIBFT^@(PjmqCSp7qkc
z_xT|w72Y>z<d4)|H@Qk{c?qgFtRUs^?iR}Z3bnb2lKrQLZ4P%?z<>rNREea5xA2(t
z5&90M`}(PC)lPozmo~-McLlC_|KH4>NXt){!ue|}Z&iHR5jsiil1^jsn}?A`2HI=4
z^!O5VAtM4hoe)-B=_2S(7f5@xxNi=7GC;Bqo-%sIG9*42cH(ZcW6C_t6!Tk%YhfEG
z?3Jaa0E_Cd4(6xzS5!{AEc&-zO&0di?UYHFjgAaYDXvf|2E(fae0(c;TtmB?BKI5d
zGZXf$Qm(=+{t3l;Q^2pu3)bcDKBImC2)YTE_ZU<51k}fPTx-H4Bb=W5xxsoi489*#
zErg|9G5vyCAHoZI*vad~))f`3hfrCEzLMkhcqZCnPGZFjI{pr@eyyMvw(YT<wwdxt
zHlPs-b|D;uPWFFSUUg|SYTKM7tN1td?(y*$&K5tzX^gO>sE0v6(yKsn+t;(eJecb%
zwC!8uV=0f}y6hFq)uknQn~{OgxOs>i-tJ0z7=cqp5qfjCvs{~(w0^${khsf6_kF_L
zN#XAw#c*eQoj{L{AvM<wOU@*Umr(uCbk)-(S!6L&MF7@Ue$l<0&wlc_AydA}9(3))
zM{cmPz7f_^p8?XoR6)^D*pjIP^$m}-?>FUgtyvrR_7}oQ%Q!fSZKAcTl6yk<hvB0A
zcUL;Ip$U8nyCQQHD4p=EbR-P<0JD@YJ*17~y@)qGHBkxK-$fyST;f>u{M9AT+-Jfm
z$NvY9P`Z7a^gb}+gDdtV%InaIA&5wNg&+U{PGxe)R-7GPT0O}twZ_u?TDz?d9d!l=
zR@hwbL}eZ%FR{IKJ`q!SG7Gx3AEVXApda&dnfp~?-pA@+CAj%0fl>oUY+;rO^3JJv
zvnGf9LHvxnQ*2`NOz$a#tq6*a%wrUn{moGf8=ktG$t9>SJgGUEkG+THUNlrXSun)=
zsBhq|)JGOp%PMobhUVRS>V8`Z?8_3k?G8X&xS7{GE3KW5k5>;^;ve8?E-LhmLKG+X
z(3s~Bqo_v9-aRtyl$Ja_Hvk=rO-cw*E}KTIP{*g|WA@D!<^B0CX#M%e)_r9bM|Y|#
z6Ut%iByeyN!hU}XTP|!+GxVjcv(yY^jcJ+vto{?C%P{g+0aQH@_(!^SOx1I8wH;~x
zv=|D>b%Gv$im2_bjhnQDN2!7}qXKD@3Hn1fVm4+YcMZ)YVmG;Y8qS;EFRf*bf&vPa
zC?r>?NuNgVuAKvB%3mVv`CG}};d#hBj<Yf%Xi?2}(31V1pgz`{$mhl!cW<Ha^V)*Z
zdOrnvUB_p;<QzeLdM_NS14gnsJdXz^O*s30)MI!!!i#?aq&#8XwfRb;mR)~6sJ5nM
zS1ofZ5_V9^TEi{cG9)k`2kbM3I>#m1R#Z)_rv10$ko2$Pz$oL5w|tO{#60n%%08Y-
zg2xW)DqIX9jH$UCCo<)LDyew*r4}K(Ca}W<5=h4M#TyWdbtyux&#$4>g+>E<p~58M
z|B6V#JP!x|zW#<c)Vb3bF?Z#nUZP-G6j4nN0{4;W2Pi4&W%rl#{gOD~xLFr2hBp*r
zIIkT=hkgVXPR}GvUH&yPPQpT(NQPo6oKelD{iGM#obxCdKRH~ikX5JA@mvVJ;h4uL
z^_mmB&qd262mF9$U?h)-_TG#isUh><^l@8vj*jf%TpsMM_W=9uo6Hs!DUk7~)Ou8&
zHsT_1y;{#=Y2a&`6-8`Yu@6p%E|l#_wFV(dG^Xt&CU$$u|5vT+gZ7h-e;q4_I}8Za
zXleKW=dSRH)~$EdwTr%10&BM=-+^&ajj!z1Zv9+$xgPGv!6?FaT==li(!!Yp4_fh$
zEFLnKX02zjX@gi$@M2VjOveq3hZF+)0|K=}3kInDhe|QUgP1#(ce&Gg!nwv>^G=r9
zBX}IyrwmU42fIWM<RlE;_JZI3zF-={sil=jUnB=*DnaGabdA8^v&69)N5E;<95RtL
z7NdHFs%G)J4iS6rlRbK=<D%6}dOzU2b#&|~y#6FJ@$A`_7s-ToO^)_H{)|dB%r8H^
z4_X(v^RO#|0fW43NAk0+$kPr|t&$64-FKlyht)C0Y{NE3oMFsU$+5o=vBNQWtr;(m
zs_QTW|7RupM-s#PI!3+e0J8;Ki}zKLw~7@V0Ipw0aaF{SLLxN}J}+G<!_CURtCE;<
zX`#bE;^mVTgE(HU1toP$LdEVT(x9J51$o!$R3>>vNWh&u(mA=TRADGj5e@ZqKccD_
zNPQ-+6$y~611r1hmCmDVEbaS24Ay(~G>>iFNsPK`h9M{w(j$ux6+>Po4O(g2|1Je;
ztr+&8l-p4ySL}y5_c{+Csp&yLTU5t2xx>h0V6Ixw=z6p{9$$Y{fBM7a&1O}LkRipw
z!)J51^5)wCSh|F^9#!jBk*ps{@x|YQhMX)wX&L4d2>@q~*eH6`(lZUtU341rRq>>4
z#CY;JJ;NdpzmwLtB26^td=>j|0rFmjV2o&e-A-;zeU$NbrO$E?SBc-D%S@_B3!E}`
zN$~_5_f>UxG=X1^49xf-_BWfZ7#3-g#SGCaGF4Iq{#!V9nwV=$tG5CR+G~AGbJ&2G
zXta6S&l@G26fvZcv!6*PRjrIeCspwd+4ElFyloU}Gq*N~!Un_ybHnme?`js7f*0pL
z_LC;1PTA&t3E!*`%)qj+!jho?kfavdOU=@zT}TZbiLuPOD?ZiSb?&{OikMBAi$loz
zcHKI+nDK~M9Num;-q-|$2oMhmH;@1J6H6LrYhwye76jGR4WY9O8-uJCZv|etF-9%$
z5+iVLHqdh>UC%WQa~3zJ$|S$0;Y>qme(}klFT1Gt)Q5gj;Dcsxh{jWy%y153h<W{%
zd6q3M=mSbe?Bkp4Kvb~){)xK%F0TVuX`tHc(J%NodF^`rxE3rKQ&xT79v=~Q9Qe#+
zMbCS5MRfRaM4uMG0OvXaLjx-8;h&ZG*}HdWg^>a>Dey3yxf$ma<(2Jwq%y4Q-YhOz
zRtOLs&TTRf;5;s0PAepH3G?T5Hh)e@d{lO!@Tl=Eg7Q{6Pu|4?&;;%gFOS^A@zBCr
z4y6WVV}QVyTiFf$xaL%~!~C&<lDaeI>PZ^M6f;J^uY9}9T{k)z>Wof)3+5!x`;xd1
zBRuu(l4JmJWZtGjMdW&9*+qWK>1jf0zRXlTv0>Xu?35ZFg_(Njlo(VB#yO-LoV99z
z(~|su<Hhq|5prw$KxtR1F|wW~?*tVwJV4JV0lUe-jCCqrSU6Vad=&%U6MC}t+-=k;
z<znde7D<<Bge5G@!<e=CauXM?D>TMmoJ#E*)t##VJA>ff8o{LgcNx!&uu_bXk~V&v
z_o*-^Vppp=vQx_@igE#UeMg||Z!xs*%`$Wbh@;^=)qE&Koa%>iKyiwZuZ)}D2|5^5
zYY+>4LtZ^fb3#q-J7syefwpnIpD#P~(<viZWaS1`*7!@t8xrOgoU$>(-C5Io$Kxy~
zD=)s~$TK4}=9%}|6Yg;PF)akL|7SriHUdx<Sa%h0x9Z?L%?7kSOdyi*H&3#Z0@U~N
zLMH|J2LA5KOEv=prO6t?6?FY)Y1#PS(sEyf!J~vn;fgc+_*&a(CK4fSk5i#BuFR=;
zX|8e~F=#Vo)tSUKMPh8PsKiQnD_-x9_}B^@rJ7iOEDX~NgqX&S=LbO8x*#E;V@k_7
zHMI3;k;N}fn8p_aQM4MVoBU7mqrr@5J|d#wpz9t0PK-0NwCHSR5>1n)51#bKlPznC
zzO|t92fT<MK&3YLZa}Jodm;uaHiIAdo7jD$o>Q=sEDrEx5juZtJm4C~@O%;Bv{<Xo
zkZaw2b;)Cha%^mTv2FcMpN~Vp4$hjD(JC5BmND!p?!bYf?P{V2O*wfn?9U<@A1aE6
zwe`|2APqjEV^#uYccHF$fZuLcj{ksJi&SgF_Fp4=v2HuLnaFEtmI;bMqIN-hge&(e
z`XLcC3~NbAB_X%V&@D+i<k!Why3v%Gt06@yBky{+)RU(h7&m}{i0%D18>I)=WJs{Z
z#6+!8N~*@uFtCQo9}Mx7t(UULBbADMEFau%HB0)=JMzd1mt0DSD`B<pVn2FG7Zh)p
zS59;9+@ugvWMlQY@YMqnl{Jj}pb}s|<ncYVW5ZIBfcEQelh5wy6>4b(ie5^A6#l-Z
zM!l!J5*<4qiWCI1e|y8rdOv4WBfXq~)}M}ljSFtinBE*Kw%vI|S}URa;UeL48mMZ0
zL|S&+@Fm*4$rbe--Tuz!o>CH|#Xdzp?H%G@ke~WrZW0IgDg2lL?^0$}w$!4Hx9U!O
z)W6NXu+0OG9M-|*i&+6a0^DVXXLa3jc3b=!8Z47DT?GZbsFJ>KEHlIBf}Bw}!)=cB
zUWZ<__T;_!kC?0@s=m=^w0Ou~gtjxbv^j{dWyfr#AW(mCdn4OO!Mb?Dws4WWPc4R~
zWagV?g3?3H++_utRD53DQ67}8L~EtuB6=D%_*3B=@}x}hBn08KYh6N8Q5rQs+e<&<
zDB!#AR3Nue$RtHR{wRvnWAr2fn~Xpo#v;eq{_nr-XRS|cz{B1TZ5MwC>}u5cM(1{|
zZw`x$8^1a|Z!57hSv0LZ+7cL&?;)C`x_*={Z=7`rWlV^p0J}ZPbVONo{7aQCoA*@N
zyU<|0-ZUVfkXlIiKP+7Xcb;9-y<^+9ZQHgRG<M@Owr$&X(j<*--Em{vN#iel-tP}w
zYh7o}nKQFz?>$-@A~E69*RUvIa=)(~3&8#3RXukjsLmOvt{Uk0Tsz;|@viH*9|E6V
zn`no87Hz*ls{##ch>m-Bc#jt0_HzDi8Wdf0hQ)O;IRRto36Fn8M%hHrl$2^qS+TE}
z5k8fFSF7NtDZ*93o~bsUNmaGE-$s|q@~4l1RQ8PBBeh2P;_xg~^e2Q({~>c7Va9??
zom$WP95Q?+pHvN*DWNG1LysN0RRvJY?mVzqQ)?MIBd~t7t@2fL1w)7Sl+%*Y^SkVF
zhkOo&*tmc5S-9RFt>|WmY3bNlKg64;BU*8|UDwje<mA+DQb2Qgk5fK3B*~F{m*6!u
zPJ38;ADP~pA-UlF5-9!?P~ZKO>3rT@N}brMx+Z{hq$$COJensVo415z*R-`QHZ^=M
z#1d5#G>0q8%hw|)qi^>XF5YhZd-}7?zMr52icvIDn&6TamPCOfA{BW^{GC-P=Y&K!
zA-`1HMinVMR?mkW6o+N5ybYW<F*wF@Shhk$!k)>=yZG+dh9(87!ld5M&Hy33Mxx$w
zrR}OH>Qx+Z{4a<BXph&41&>qmr5E)ODAru)UGd}KAuSl|t{^m=96;Br<wx8XGU{_-
zV_<Wo7S^Nc-+%B^I+hLpZ1Sy8LG#Fq$oHBt2O=8WtBkO-)zJCD!R|V?z~*swPN`Xn
z*WOhF{X|1)h~Wl@;KMy{Bu{M3wXv7NB0xQeCoU~{_0}sn_f&2_!55*#n*XARH^fu9
z-0RrBTS=s={ZAn#iv_cNOk>dE39zowRz|p((d()TSdCUHBrAb^Z7H^%LLUW?5TF!f
z<)&+GiVM7gGE#kqV43PB6hL59qY;%x&Sdd%$XKz^RfOlDLKX(G&vv10dNGn666s!U
z2N3$Gx)l$rZ!f!LbI-!7XhJ>e!Q}IK-2Zx?>fD4lgX>_H=XU*5WlxHZhBjF-g1%J}
znk~VOq*AAn?kYf*!N!>ZP<G-{g+$)AUH$FzYxSb*axAT7Ys}9ZwRCNMx3ESOA6d)}
zM|sX!uLW02pRp<Ef0b(Mf0e4)7}IyYXX^vn2N!9->biB;hw|;oJ3z2{@4f>ZyUMtu
zJ0FJEThr@B!b@b=c8Okfi_;)%eMw}bHhT=qW*=RL@;=(W0WgKA5cAVnWIP}1@R|9D
z7K+#r*FU?3GDt)pku(KAi3wMdvC}@|EH<K~wBHZvG1b$Od`oGi@JW7n5~kdRv0Fyi
z%q(Cne%@3rw?j{VdY$j{1?+M2sXI-Fk*A7OmL`Tdp{ztt^AsK5#bxCb>ehf5#?<NU
zY16B2!)0O04N1ahF=c5GdL?ZsFDJKgXYH6nsT&_`@$^%Gh3DDC`Hl~eNW9A<))Rd^
z&A>Tt>!$nQeR0FI!&<8Gj~Dep_>b`$Cluq&)7BQ(U;otk)1;(P`<Ih7u^sGj=3si7
zqIT2_?gVFpRWIWfpZ#mz-(UKj0zbe8FzS@TA1ET5mcGOw^E^w;Odk3m;Q%UeNyZ{3
z<8-wia%GwQd`?-8AF^$eYQ~Ypz@VaX3ZO@JYPCyl`Q-h0d95n73Ri-7beyTCpq#V8
zpu(Z{al1-hOfYxngv&ja^ic{%iR<|QQRl?weN~T{+8rCG<PW)^<I+IXfU>K<hE)^<
z?4$&REG6gaLkYKyfP=lfmu7{TKKTWY<24VqN-~PTR5P!+9H!#~-k~k8k=M^d=h}0!
zu*+`ua!xlF++ZA2PjmZrP%{r$eKu^<T~-&TSEMOHdB^|fLNok38NJju)K<0H!&Vj0
z{Ap40z`ez6boWajyX*yj1-Z+hL$yREPE<;_MwqM{k!$!_F2{FQPnC|Q`msuXNqyN7
ze)UN$a8l2PC*WYLPX(J71Ca&1v#f+2$9dQslVzUBmBz+qOaHe*NkDO+Z!$2Awvi?T
z^M2%o{@fK`=K0Kz(Ws(XH{&9lP=PSc#jpSgz+pAM|IG;?U#BKsearT!_jxqqJsp({
zwpQ7BY~*lHaS|!f8_#V4m2ZtUg)Z|j#lW!AU@~$(SdpY?kJcL;kpW3(JL`7QK55^5
zBH4MfR&|&6aa_i+e2`0Jh${RzIHp$SZo3-Ji5n{9wo9&%1~dXp=(=7f8TM1iSCM~F
z#NWXFm8gV%gkUYZhaqPE{(-I;YZ5BSo14iok65<JK!u3HcmD19N=0qoddsLJXHA^i
zbrT{1tLK;nw}TiCI=YR8ei<!+Z#O980lD(v`zNdrtiP9|(Ic#ARaM^#4rN#tv%gCQ
zY+wLHJLFXv1-n$I(i1==QODGjH{X?xeDj*Mp_r+mz15Rc45O8{(Kph{<+4y0GSs!=
z!Ut*bF2smlqTdg%elNyOGwAw&hX*f8z=R5=($G>+dBdw_W;#@KryeXtR=7DOT8fcF
zGwjodcW*U%L|y#p9&oeWI^FL0{YN<JJimP8bkzGbiHX7aJ7Gp{j!$bLX2qR+ZmyF@
zhQ_HVSxQWg2caLV)$$iKSo{wn8JW;!dKlRsSWGaqC2Tvh2|GVXYdOCihQa&RzvDdO
z=Tc!Lm4%_FU9RWYZQB)AM&H(C=GATAFZ)h$)>lg_Om^8E!8|T3g9BAv5d#YLuC+j+
z4?@Abm@KFU<!6X)t`Z+XjGkf%o*sF^=teWU^P0>1i37JyL(LIjtv^*-z*&?qSj$0)
z0k|4c87!UyiR=wG=N*&->hn&BQ@)G1^l4rQ5PcNCGYs^n;zn~2;u7-)Ey3rfNLRT^
zV^}Xm`%CO4YDB4ke6aV<0|56&+eOJZQ3f8*2X2c8S}Xi9iBhqI6fYG{qxTRrooQ!R
z@7`%GsWUx4eC<E7w8Yr~-bYx(r17}ltXwszls4WY?*6JtcPjqB%A4O{sZZ8dE@ZT5
z#AW=*mfKk9@w`@FE3>(Pr}&uydhFtT-g+bzYkNIcyP*Qju2eTuz+~MGFP##X#Mm+w
zkV-O09btQmYbH<C(<`>8$$l+|0Z;T}(jyd@xQo*pGS@GFaV~@W@NGn1ha;Q)otj70
zP=Cr>g;s(2fQD->8RJ5lQ{ndQTx=>Gb(VbbU;>0!P+1KHVGjzH`-*aIu`p)kN6!J%
z`P1iHon31I4om%?_~oZ=oL1KDZGRw~c$hsMyAWr*qa$QSA#L3hG(DKRe1by8$j(4H
z4_#!SlRR<%WAjWl!UJxe`~8h@``hN~;xnTF6GWAjDy6P{<fd-BxRPMpC*pi9lm34R
zcez!48Y$Ho&h21D<+VT$_OW`?gmYd8{)BOnmlhtg$hCNUwzfcb%PnT-FP=x1R9rQW
zM%$(MI^>T+(J#e%uu7_0T|}s-%K1`<dZ$<c(GkPq_lHLABKb-Blc2u8)^d=Ntj{SV
zna>nFL1bZ?*+r>^n>ZyC>?CT=R>jHZFkDn9;pu>173Qk5#L@vV)&iA#%tDemHwxKL
z)?<gSXyQzHtw{h26lNM!Gn&l#h7@5@nF02bCo9z`Rlg`C)~Lxg4#+>e{y3tpI%vW}
z%xE+P`6ua!f&PwJ=HqYUgGZg!aq@(Mnp0_1SVvfG2&@mPto#fs_V4BoNMAJ>@qabh
zKf{*0pH{EOgSN+nO+R{CzcXQHweDJ7u6$28fE1s@U<EM$U6V~>S5?a&*yw`2h?2K@
zI}fz`ZPrg)Q`tgUR>LLjqDGrYOBw^o?HX=M9<Nhg4wrYIau;86Me$v3E(HTen9iJl
zVc}?62>tW{BUVqs_i7jEgd@!Xr3AM=vC+SZ{V76SoWO{0Gn=Nu*DAFFN|Jq#u%c6P
zL-6J%ZuLV7>6yC`cSQaBn{e*C75pu1ODShycBXDSB;~ZJnk>_NmCg*v*bc)FF|h%{
z4a<+{;|bkJwmSptDL<S|<v$NtLNAe6T`vlLcL7v}2FC3Wrk1TVo(cF0ZCbYQnpHgr
zGEDzd-kALhiwVegV@@r>_fcRJWLaz1?`YiXZZU4zOrkKy>f&xs34y8)MVA204E4ly
z^pgjEWk&5BdZnwsg*hLyR(KuV_9+*WZFQtxsuUZGpxBVue>kF(yK%V}A1<v}AC|WN
zh7d4@PMS)cgg%cRn5nwbLPsFdkF$jbSeBLz-V*_oLLT_@=Zmf{#YVuHNg>LA%`|4J
zENB#!;)+Kwi^nVpHy6k*=YkBJ4U&#11-lyGHctt2*ayS<VOD!1mV-kCROhg2Yg|=w
zjLj;S?PR-TGhQ)O?^m45&Ji4sKP$|vrGC~uz5S6hyo=8BnE7~EE;e9ffQLt1VTb=6
zieB;B@+8lJM4s`%bQqSEs(|?y^xv!rjvc7hup)OS<#!d^aHs@Nw5F6N2PeRoUCAm>
zTLRxW!i$&{5#!Sz^cKIOubh%Hqzewf%>T>S2QRkN@K<}4w@F+mOB%y2&}Wg77Q{3&
zMn}~w(^qC(9*2DX+X3teqeJl#HCS<ba(Az=qAocgw}d{wqZ=~MLl%BkB~k$(bs9B@
zb#`6Ni(uR2o#vAByXVE2Jy^ckWQhh1hhh5hYea8VwQ$AiYMe(P;5i8u%A=<+hm^m4
z-aNb5_?Q+ThdC5(TyrztVm&nmo!H~}hT>4kg)Zw1)k_O%KY)w@c}vF?9oPKHCHLFm
z-Hpo*+soEvV8ssU>%y3T44;t+3C5WE{SO>^{SO?ndx7|Iv$(g>Ly%azf~trURx$i+
zfaXvh_T7X|Rg!R0&o%l3l|zxSu@^LNYG=1BTs{90v(%~em)~Y&69it#vUrDX)let~
z<WJjkJr8lq5H`jH>MG?R6{2W0pGBP+cx)bz<8kXo_@rnE{NmWb(|Kansc4XdFG8|Z
z&WIEb9C~mtKqlAj(d*O-hS%EXRD%58;V&>MJU5cLlThd<aP#BR%Z_4!8?8BF4wEc-
zCPqz^MJUQ>Ns0xcx41m88(e_Lx8&j_KdS7G7%yC8*)XS#!OVPuz4W=(^=LdR_ip&*
z3$veEmB7KP_DlS;!<CbnyC8LyYryj{isjumANY?!WtjZ+=6b6f@^->q9Z&wam}V9#
zDQFlbwL+nhKG<{?GzSA?9*_*@9v7$OKJ$$U(;lXsR2~@Bh>#u))WC9Ryw9W;tT&DE
zoC&R?CMF(h)hxb(uz~9xM+i@aCw{&*QI$*dqyc4T=fVWmAk}Ec(BVcDUVapokE73c
z@!)fvxkyvVD3@+`n2PND0x(yk?zP})X=!dqR1N}=BF7ob8>$(>*cDn>BTEinKx@<K
zQ$XJzeq-LZyZJbIP8MMcm#LEEjGPIVJr=6wGkDpJ9$Sc@<agP2O10_wosjRQ)DI4y
zem*q#d9Cx+zC0^USw16xr6_Is&AdZlVIC>PA=&#b-p}u^c9mP%{5ubu|5au1K{>-5
za<i_lKsxV_s6uQWMSsaAWuS07zk-YMk@MyB!A!|YVW{MOf3*LQ6B#(}ujwxu0`A+J
zQ5F!0nnYa|E{ZFTD@LybEKC4+SVhXzq@w`X^Lmwo5~CF(u}%J5@CmFy<%xi#{w*yc
z45wtt$Y-qBQ@bk`)bfECB^kfPRMLV4^oEmnF!u|KX&M@-%Wt~b;Wl7x`d0buCcL+f
z(qj?fAgdh6tlYYvGOr$}IQGEy39GV^UxP~ZwH3gk_?Hz5bZ)T0t|Phzz4f!A;LSuy
zNxErDeiFd>*0nyQJ@NlN=`tbfnV85^+IAxJd$U?rf`c37D(<;64Y-_qbA>pJPcCW4
zAOq8aq7+!v3u&Eje-;6lEaYpnA?hVqmVODJ@;xV+u^Tc>(KAh3gJ#^&#;$px*z9ho
zi~JMpYzZL&rK_8u+w3t?f{NrWsA2q4`(lGhTGQ4zf3-zhe&zx(1*j<$fZtS}1U57G
zMs4Wf1sHg=^(08MW;5!-$w&V_{jNxy_Q}=f(6M6QeBzZB6l-U^eYu`PbIWWPPP<s!
zt6<Q28#xpx83VQVnchmmF?~@5TB2a^%HBLHvpy`i{D$*$b%hwivdU#xnaZ-}oXgt#
zV^m5qlBbuUR(azU`!>nIWx?lR?Dlaqvp}KLset)YSBF^Q3-@tiBI5CUub*h5#wcmV
zvez3Q76(M~({fLIXo53j2-0UH7pjUl4a>&K@XKFj_@;f!*!3kK#-Uj`B`Hiu^0CQ2
zUU4WB&y!i~0NUe9WD8EDaDmF+58P94FH9@iK}yw#Sm0X2jOE8f%Lb??C+ZMdQj5+q
zH>36r@Sf_-eF!XuIHy@RNs$YJg_@9xvZ6yfC5DkmaQRs;aoTO!#QEL71C3%EI2M&i
zOMg{5yMc&~+X!%YT+D}04&OKyj}Nl2NwR3n;6}4U-`RjF7;%kUCr;p)^sXGnTC?ge
zS}?2FL<$zJ`5AfDamS_c{VO7@EdB$$5}BG)hpEa_!ZSh?i`jnl*9a)Oq>DZ?w;<gb
z(3++>Tni4ZT>V{~9i`*ZoY#lwn2#whRV!f(6suWSPb@p$Dl{~|wKs37x3t4Y?d2Gq
zie~mZU{^P3l`5jjIR{|zTD0TBFh<z-b1$e*#fApbH17IIL?~Muz!9gJ5R?ojTQ9uf
zYAKca?iI6FKqFuLby1ji4l!R<6cjYuD?#bpx!rgi1QzC`XGD^67K%qzVkVk3kl#Jm
zLBd(yU0^m_7FOc#X5dGNXZ7)dupfmn7`P0TZ*T+Zk*A~sd;D3%v&&f>o!guK-m3My
z1?FUZJV?c6kI3b^<@~>8<K55pwoSK-Kl5FAxY*a74%yA*Z4Pfd$w~UfAu9bIh43XQ
zyoDc;DHICQC9~h2D8fY#L6R8_??=G-#i<8)Z15?pd4!87ajHS@v|yb*^gN<j03(^2
zTab7*&=u&nh6f1oAlGvnlIya77~Z!<8AEXC<W@A8-oB6-j3H<k?RB_|NgU@#?OO7B
zBM%9FWK40`*~i;1maIJc_1#^oGbX0F3Z<i-qe0LE-ua~u#mMuqv_UF_dOHLXv~)|~
z*dsTwCu(wtrYfk3$IvWJU7w8O5`|z2P_P%4%$(GbT(hi(Q`vVYoj0kOy|0K@{4e4`
zCOv-t332l1SUk3UGC*A9Rc9}o-6|Pt?CdA_DHr04u^+*iFB2h{P3EM`Id#&hO5~>F
ztE#mh>sdVSvb<R~s!!aAQ^|I9YdVuTGOI1wY5V*=eJUQ}9Q+LJ0$!h`S5QW`z+Dr!
zq(?l7A|;fRV37_jBAgr&teR;lZIFcnRk8=0+z8e(!&OVn6OkbcS;w+9#+u9DAnqZ#
zKa(u6{b_E1v)in!2ES*yL?2z{`8N(>frz31-7`}#DYZ{sZpw<ssSkJJ8P+(p|7!2k
zk6+)5^mDU{4g=r1Nrh9dK;rdB;dy#XwU1Yf$)3old=EW(8Xt}XLm~brom8{?ihZS)
zkk5b%Cc+$CyvW|&AyAnb0Owy(jxoI~Qg3_Z4t-<49gY;=LK6emOyLcURQWZ1Up=H2
z?RN&*7L`G!g?B?RdH`&S&1mW>vqCC`|MwI20@T-v>*xNP(iAB<&76Ferj^2@P?$`A
zB_v4A*!!GQI*EsVOF=mHeb77^$0j+S5)LX2(|JcQ*)#}Mf6-(_(kxPta%98G<%^qz
zCi$cQ70uw5C4&b=Z<T@pL)%~8P9=UDZG!Eqec?47#nJL13e02JM;{#<MUR0fk33vK
z`Tp1*j5jnQV0Eke(i<QuntB42)Mv}jr!7Y}jb-{&aQt@kUzH)h{^_v$p9~=$XCpox
z26um1DF^{ZYgTdSE=czc=8kWQd6vC;CgLWKxH0o4m#!XDyxx?nUSRy{%gV`fYJKKG
zp{XF+;-p0;K<Cm|aTK6tvK^UdCSP-8Aj?Erg&qKvhcthVp*Sy8T69qdX<nFcHqjs>
z91w~9LJf0&GGN7k+(N&z4+Y~s98qgo?^4?sM|QYE@c`On8~EK)VczC7j<rmJxZe?u
zx?lSUAnjP+yX8f}-*E#y(mwXl5v@OZzdF&1v_{AIRKqD`S9Ll>VqD=csoJiRS-ni0
z2!g-(u10Ry%Fg$i>E2_1<ZSq8AtmUW%jmOa*Vw!8kCQUJP-4%>h`4h5tvhQ<Y;F>>
zWE{O>JGd%QTtzmTJ-IL-wmVM3ktuqDi|<%GuIe}p>EYOu2_Z}6UD7x;k`?p$!*oLV
zp^e|D3EIxXwNeTMUxQ^{`wL^UhzvhOr=cv2QmrwHaxF|lk&t7+!>0YRL0H-57VFLb
z#8lB?biwAE{IwThLucSHlazZ_Nentlov*TRaQTvcAsMs1jjwJw(i6Y#6t03x=XU4C
zG}U2)qH3a37o}}OiXZHoiz3()+A!iCi+O>WDP0w%oRV;=^%-wl0zD&Rcgh4?$!SB(
zxu#z~{pa@Tx*{~O<iAN^+3eSU=Zfxqh-F>2=svu8BLzrKsluq6VKGw5Cj61R8%$jJ
zB1(cSYFbqio7C0NCPOpGSN4=k&W|+6u~0C{V2_<d0g%Nz%Eg4K2!jz?Q?97@@zf7^
zw^h>tF3|kr(dHtWljVJ#*Je2Nr_)7$CSMOuwx6+9-LvYAS{r4rWM;+HOy1{KDaCpw
zTMa4q19L))m=Z<r?7gRLiiLiB?ngR$onNB0PP$d}gaw!tnxNG{#572&JnDxm$*I8H
zE28EEVj@@&9wy~TFMJD&Khh|E<wQ$BKVG$Ro>u5pS5Xr=;Rm$R&V$kL-GHJ)cxK9j
zpdYEwXw<&v5{PX*;z?ooJ@M;*G_QZT1c&^?oysQu96jn!1yxctFIo|0&Zpcs9epe4
zXS^An7M;nda=-RUGM>F22f+u$OaS%9#KyaRk)w31v$ML4>Xlm8%!5<N>X?8Msg~c8
zH^XCP1ge$~fw7MsF`*32Zf8J253Zf^t;i4|T<e$`0u=EaXv(KS<*FORgCQiI|C+dz
zPh!|v2HyZC8?72|2bVyyGAzK^-?`l9sXebFV;-~Oa?Cm9^YLGi0V)w7Bz<y*+r$6|
zVHE&mcmN9rz$84}k_^y@;jlXCga;_^i?!y#eV})1j!!oMl`d8*MbC&rzRm&d7k-DV
zUa?9uAb|0`G`#dVbFT9~xbk_saffvIe-na@fA^t7rl%L-u-phpH(6qM3ru|KrLHOF
z`<ZRz9Gi=YZ=K=&_1*b9Rkb46eOOmd<EUp-lO<(^;WL)ZX>9yf@7f<~Gq54*XS2M8
z7<A<G(aadrI^6*OeQeXzu=#ZxRFHE2f*oU7riv!RIRxQl071uOsfu8A4KK<zXh3wb
zPMUPsNtT)jUCq>^ma==U*)uHq&iqQCfS+yuC9UGB6>)Iy$S06b4HlP8u}w3lm_87c
zh^N=#H&}MKe<>pat7@oOu0$|e`ssb7U;*1+rd8D`Qa>e1b*JrCmKw%TITU)y$pS;~
zj|EqE!B+MgHSaq(!)*+?sDI8K-U(j~z?}r{c{RhP*pM8-+a<XP0fZI}j;BWX9rruu
z94sicCN|k_Z&l^SU>3eWW(0hNFy)<k#yERr2()r@wv*^4faXmFLd+i(QoZm;DYeAH
z<j*N;FBmJ_Z58l8KoF+dA9-Q(a^p4_TC<>fsfTI8(&OWQZME$Lx~mxKPhnAzi5JQH
z1A4xuXKDD%nacTC%Bb37469lL!dVgsIzAlMgmzxQl8(r#sE~NEzHs88nMlD3Rg|RR
zW$FhEfW{z1Gc<m%Y_?j4Gebj7psGFZh_v03U_Z14had0o3MpiVyYFPFP20+NsDRLq
zpk=~lDZksNnKf4hynh}Y+W!ss?oEg#C?PPjN!?dyD@IRVwwW6G9NOxCYZlOG*$ikB
zjiiv`ui6Fj;)+X7iBvGfF$2R2eN(U-%ID@g&o*S5rntF=q~PcT=eq~l)iM?Q4<<qa
zJD{Wj`48%V%Y|_%#$fF1!#rW$3MkiT^_E+-Fb^Q9tT8zZs<s4xQibA0aBpl(n7`$a
z{o2NnVjT14c>h3@cZ?WA%BLmpNzvbn&P!VSuzp*6KfAQ+DpEWV#-Zu-&yx6!Bh(EO
zF8aP-R};~4<;uF8Ys6!?PQ!xAV?6V;VMs!s|3F+Fu^6#!?S|m9Z(IxQ<8rruBa&J}
z^4@})Sx_Rm;QShK{s*VIT6uL9o>^DqUxNJ-bt!xi)&f$I23d!Z=u)k<=1=#1PuCYG
zkN}kK!!!J*PLTQw1~B%a90aBPOBYr=Bc?BT``2GrTIxiI<Qyj1;(g3ZO0Zr^<i#xf
z7v{P~vO!W|)4UQd|J52*B8#dK#=IRxa*mhFO&fp+(nfV6Kt{625-H#M+EOkh!$$+a
z&uZ^o>j|TfZyPuS+FPEB8Y>8lL2n3GFT2|=5Chc{V<Fn3$)s6O);(NNZV_Qup8?fo
zI{fP#>*K;x02#_{2KLyEHqgrC-p=7VYt2c^j9LKv2zr_NbiNw**+M}7-<!~6_KzJh
zk!ZN@W{F+Bjnvg9F?2$!zL~v$yokUQ28+PWCC3ivZ9YuE?TkhlMI8XfGnC93%Z2m3
zryHx0qfRi2L`pp1>_EqADz2Zgf>J_ZNign|JlUEPy?O*+v7^%~uk4BvTG8OnhTeYy
zj6um*4VNe~E4K^jfabgiV?*VNV01B3X@mq1f7E$;N`bHdYJt>>8)*_LzWYD{sr@1=
z($V9;Ie-Fc$?d|9m_m%ls^gHCo|!<DYfKdiY7Yg<YPIceQ5%IMn08eMB%L<Cy@Jf0
zi=|xWlz8PyA^>$C@u8PT^P&etO|Mlf51fVFe;xm)|G(^CmnT)HwE#ZK`C@D~NSpGg
z>THUYSy!yUTPcD}0KbQ?W!W*QpaQN1<r7lt5loYEJ5B?B3n0gAPuW$yC)vVLGfb3o
zG*Us5{l<Df3l6bcPtYB1%a#@bQxXqQ1o@GDiLWwbgP%LD=7O?d!&3-87Xg<*m|cX^
z1lWdV#&Dz&KS8c^!8e+fwMirl+AX=#H7=<jr0~Bv&!Rf*zf=!GIanp5ub2%kuO^nT
z2#7gldW)op*}EzVl?QoE$=F=_R#K2`c`aRK`a_7<2r9I;QKJx(3gn(fxWOs!nQQ26
zR*p~Wb7reas^?X2dRd9|f=g#x^}nm`BJrvZx$2(o$Ye2g(F1##n4dP|Tp+{qE!zpj
z6kGYmyTL>)j@{Np3nCA?U5g$75-~3<8(@V_$^UJU5FO$tumG4*FETK(zrPrcqda_u
zmrJ82Kq)B(3KwVjhN4Gs@FzqXRx_A?bA?7ZlA2t2L6$hASN8Sh5f5Yi;4enSG21kR
z(4{=1N*0{;Ft`zI<s7tWuw#O5HNdW>Q^@>U{bYs|t&Tu~*wtTUBjTGbV!Vp3d`oHL
zODLcT=Xg;H7Vmr}7UTSKo`chHWAKJ#`=pVrv6Ioi*zq(v<MKC)`(0L?>!sJ8Qf^tE
z#J`%4;$NH}ojE;rJm3S=Dm0GWL9ksc6xGaP)sdR0=fI+f{mD{IW~P#Gi%gk)>h1Qp
zXHO+nm5YOa=f+`|!rmCD&OK8n-YAD&6`eLh(F8&wS%Ayl?xOt6?mPG%C&l#9F2!wQ
ziKc3al5Nc4%Rme3y#wFAbRp)Ni7jyQc6~IuRgdH^kr1A22mB!x6}6C)V^@=@F24~X
zHWtg}uPD79!W%`3S@zQgooyX6{*<2sL7hBc^&pl2a%77|zXj=_wq9)@Ehx^lnhsZ=
zn)LuW=B1cSqV5dIzF5oUluCByWuWBI;$xL5r!G%&?MGv8$v@iH(V6U{C-9C%Xq)s<
z!x7*RFuX_suG~>w{50C);xNdNO0$?@Td^i2JYNVWq|_nL`tDPh>MQ2MLc)ro0&w(4
zUM=_9h+-|UWPE8vqD^{nqQAB{saO7Xvnxg4JUA-;LtqyMI)!Dr(N__Q{z5Fj{&IQ0
zc2f+IFf+QM4bi(nkQ5n!k1j&3Knd!PD54VW{w&qS5m1<~IHyuh7V_?4tz;$SuRseK
zX^C9tn2fqIIk#|kY`yqlU207<s&AfzS1VR&4(_GFGsW!13st`FaqqGXw0?m%K7+R(
z$u0A1a9Gzn-B9Y<DVtfIA^R`9|7o=DTrf_0rJg0o!WW@XW$}$Q4=@_k9ZO~1ckEs@
zVx%=I02bkU8en^@fhko$jNu6|aYlt=qx3G-1p@E#gK!=Ub(Vt&4hFVA^%Fc*r3LJn
z*ZoBppK1aWlEj{<athi>AhM9mx~l8piiS2F9~u583@I2};=3ZL=eOmuVPwk6OmVSt
zP{P;wVCSIhsua-Zr^sXXzlA8pSzZjO-=N|X`kMhdVpg7j=;KXYU0us6WGF&|>NTRA
z%C)UjrnYtjE|Hkcalx=vu5*?`?9%22v*Rs*JYw%jQT8@D9LEN5Hb77RFHPr39M|gm
z)^*O&300z7)_>CLudco$IQ$#qfvlOSuBJfAg2Y2xdBb4gedSEBXCS_lqwoWUHS=Wa
z6)gn0EaMCTd%xmFk9AATq|d5Oh-h^#7GXRjf0g(>>B#y+!FLd`Y@b6b%9AXWrt9uB
z<P`9g_$DMo16?B#Dtbv~mVW5H2p>8N<spLZI&=sw2+H$iKYdWN0L^bRU|Gm29t-7y
zX@3+<%TM4W*a+LeA0Ksrz`MtHWP}I=;!YTDzDRrWEHQAWN{v=8%8AETHL%B<fHOP$
zy$9xQDL}GgH&i^_KH<Y?)j;!ZZ{c#~#6GuC$tF6=L8vuAb0=?|qesX(&if8;dSQnA
z-y2@|&kX6#<rzD}wNV+Tn8tCjQk(!Z=J#=zWspq0>Ru$Abve3ET4u<W%ffGMRUdI)
z)2QMF^Y11ocF8G>X!X*wsKTdcAT+K(wfmx29|?xqB%DaR03OV6@Q)sU?76Hk3kf9H
zKz=s|S!Zo7a9YQmma<;;J0TF^V7jfmvddsv5d7Z}Y;wym!p3zm*D|u;0U4<hN(r(x
zn@Rx@)+(}h9zUX|!Gs?!L<2%a>VGtt`aN3C2>w~KA88d_HV><9E$tVAiNv%5j-j-m
z)3O+&fPomz(S!9$NK+sHECxLG#bfP^&}egm3@zG)Ug8Dblnj67`qEmpeYvaFK(Mws
z%;okDd2d7IvAKC(-}j4S#OW0E&C8<So<z>XY`@)j3E-E<DGrMq75qJ94~B<0By41`
zGjE`0yS7O&KubXn9f9NQsIkAGjdd(H(TEWxRPKP9RZRfvMcL3w5HB|ocheqB;W!SD
zW`#n1Jj-as9{SlKX99wLGZ?KqD~!f0em%SUAy~YJuT07(Z&hc@V<;3D#Y7}RL2LN(
z*hN^hxARl~aM-VA2OmfU%tvwUNNJNfccM0I>&d2H8}ZUwyMH`d8}HSa&EayW7I!Bs
z0gy_}8;SDw7GJ5e!@e36&s(fAMeJ!tDia6UXU%eb`C@O5NfF^zKjyW$%y3N^PWH6B
zoR5JX&HoeBr!Dz@>6+Vj<(rbSkS|tps<Ab>cVdUhtCsRdB~NOorwoamRDyh^5)iEL
zAGF;p9Qv74wUpu!vwqAsOi!i~K+8tr9OUM7C~9c<MKoHr%3qi1K>6)Q4yo&K_ePVR
z19kwYXN)A?FaQi23Km1m?4BR|!5lX0i@Z8IlElSgs<CqVDUqi4`rMQ8vChJg38`PN
zGhw{o{m_Yx6;`>>fnBELv=r9lWhUGDGEPH~nAiE2hu_f6nO#W-jv8XqCbJFl3rfoU
z3BxJc7<Ec13nVVb2AT#BVS|eodl#GX4L3E1(Wy`NfEwYJ=f>@MUl>%~%Yzn!z++e5
z7bz*L`RvP;@;|6!uY@**u1wSOy(-kvyNZs1E^&Um$o`MiQ<d}@RUK$6-_CLKL3Ie8
zgD2ge5I&8xW))`@)>oGyCT2F&xR_rJ2dTW8n;}W~W@~$tmn$})^d;4zgF8dd{*(pg
zNnM!IgW+U_3O{dor!qXHHQkW51Zd1KGYk+BX)`T^4_q^URZolkXhP4JR)HF1mEEb>
zg1j-Uk2qBQJ9%M0W}dF^?9^fuH|2J5`P?mFux0)#RaeUP$*YIEX%jP#dCFp93-ycz
zXf;GPkp}}L%c50Ae$O8je}zR!wdrU_%$}ig&~2-t$Ui)>5)ofy=dM|>cc2f_)0+zE
zJYDxYF|5Xa9s6`7^yB@<Ho^L@>mQkL@pd}YZ|Pej$k7<au9_gQ_(nO90tVk9eSfCd
z1E~2$g(9!w<63?ClwVtTm_RzQAZcP|y%a;yE#=TCb-y4&?(7C*;j!ir@x@@ucoki%
z?zh~_%~V7u@PIPEe!4wFGz~<KBNPQkv1Z)qLPDPS8b@jr9)yD~agB-d>V!Kr)<c$E
zCIo1i1(SX+jVa_b6o17S(snUPorSg{#fUIxWJtYWPvltu?D>@GK1$cwLP@!=2GG?t
zs^Qs_W3*y1G$~Yz)<`tYdjBAhuq`AxLjg{C&sd{AP~u5YHh?7?6!*qSuEQsa$4V(B
z&N1R4{Bp_S2jC40c6P9zw7U7;^c+ckEXMG){dn&iKzfg-+xQ<@yvGKl$5FxWXap3g
z+vsVj-{}44J6qg}Bx&k545<<3`dw-Q0-&;|1f@_SrrKQz68hcJ`}REqH|ou}h}I!w
zR^NB|sBcrNw9b0h5z@um)h3pOoN;{@6nI^UB<2Hjnz%9K0RG>R2L@4Ee)5tU)3=gN
z6E?!AnM`c9V}U48Y4AVrLBvjN>{_B(YBW+loY)tdFkz3#psr3>DhxPjTN}0OlNJd0
z*cTdtqM?7<HrR|vg`NXf7TsJ1THLJnQd)J_C7wRP;TuRaiTpDW;G!=o#DwrIgr|R(
zZ|Vnt?ZcoD+!H#W#7n~~z%GGi3D(_t{>t|6l}dn%6FeIRW(3u6SvCstn&*^+gmKxe
zz0^=$y<RHi<QXcj*);Zs{DX{-o?rfW9o0gGehrHTFi+w@C<S2Y9zZS+X=bDD)$u7$
zXMcO-Wnx?o>i<@)z}MoW>@S%8Vd-Ust?k?{2M)G~gTUA_&IH132PhLQD%Cb8>zvSB
zmt2Dx+q<Qn3)}wy!OfF7eEDRFI$*lbP?g6CJU&9z1`w7H8Y-n4sGZW>S06JuM0d3o
zhAJUhy4-_ulvIu;c!>8|Bt``Yv~{RAlD&359^b?gKmZggzB>*0=j5&!hmF)WEr@Zg
zt3NpDO_M#%BjhJy4HW2+Y(p!gg2KvNQ%|LUsXO8F<>Mx@K^g=5RrY_kl9qn^CRL|V
za>|Y$e=D2DlNB9Sznq|5CT^|DZh-<>uh-)tZyG>!$$$O+dBFdfJb#%4!N~oOi7}J5
z54Mcd^;9{u6a_W<Ah*q`#00;=q6w-UIWYU9>W}`idU@GqVZ#V5#jm0$zYaR8$b{}R
zBoHv$myc?iWB}qj2PmPy3w@E|2d4K?#D;6tMKLEEyn?4XsAXoKDD5amPvG8z<Zz2*
zdi?^&w+IzQNk@Z;a2QYs?_7H{H9F0kl%TzBj(l4)71j^4EFlP*fMe5$08=>b`*}*i
zI;;z9t7^65h8jiq^_w{uuU}3n0`uKdl<S@GPg2xn3c&fp^iFP%smh&i6y{3<p@@iy
zw^ZM5B~_=v${qZVj~;Tc9d>raA?kWe6Ea9!;~6JtDp7i+ebhpj_CcI17xu}^hcRd)
zeRfRY7<*5vTa@tqq+E{<e^|XOo)zf-chNv}h!>5f@2>K-PxDR>GOUzTih~$614ak%
zZXBc7Upw(-byVQd%{g>x;|SFi9p<D0gz-DSC8l=SPe<q$FKU$jWlGHelr~JhVUUKf
z4&9}U;I%r$vw&n-?cxuNEp>9%CjjwP6ct|+5xcQujP?9}2R-5Xm?l`MW-goW2S2Py
zfm|mUzuKi6pwkgEA%i*O-?1^Aif5Od(gMRgLnsgtGt&FA5iSoKf(lHvP`fT@S1A>p
zwq*MdzdT<NR!>q0IJ@c*rdnfC^#a_YbND&Ri6gG7t44#NXCX7ZWJGA=1{He@S}%=a
zJTb0t1ag>?MEA1!F^(a8L5TBDY1m7qfJo%p>IyO_u`V9S#QOFk2!>|$nR4MBpaGMG
zHv8z-`@T-^HJX2ZzuVw8CB2>_Ts@O`O~`F|=lW?{J$)te;C~JL(S*O4K#l$D61y$4
ze}1uX_i_8ZJG$d6Mxj5x>|i2`<63oLly-N#sgE)<CL3)k6!2&_mT6L&hB^e&(zRj?
zdq3a}R!?~<bhK2MNamH>svN9JlW=_X7QjJdKYNB3sfouZ24fGfKlMJ+#>UH7ye%<V
zQb8Cf!oX)6q3U9n*`P*f0gZ2&)ly@g1&TKiznf76#^OE@u~XG_v>2t-04BM|M~cBa
zfya13{t~W3mF&W)Up&VAWs43rd$x_KR^0-)o-$xEE-A$^(^P9K23LaApI;FjsI1Hy
zfW~hmCECHveK(-3*&n(UQcnD<Moq$Du~>$6uAD3-M?d~fOYL0iL%g<N;jSnb6!cP~
zzxg=CsD|_NFnFHTHW+(dKJor|+ja+rp4I8TU$sr;9p!1Q`84)={D*c~`!?Aaw7a|A
z^o7}ekX2Dr?$tV2CTB(f9E|RY<1+2=zA?Py2#!-S`hTh261C8xKU~@v#xP){#$_tE
z#E6SraJ$>W#3tu942fMe!=ROQfXy<J_<Mgj65AC_)r|Cfx(!1h=?O*d)U!_F{9}aS
zLKKvOBvR`NDOqL*S)dCucbJRUEA-x2U)6PrL5<Lr(N}&2Ku|hyGOM(*(zm`bTX2Qu
zK~CQK>&7{sBsHFej}j{<pHI+UsZd!a8$~Hx)-SYX^jeN~o?~Q0*z0#27o{+*TYLDI
zR5a&-atSMN5R{2YGpkLS<cL+{JU<`*Y<e@Mr!h0mVSKB&_(`j*YlTeplZhkPyCgJ*
z+xL8ltPur;7=Q}sru?K|3~t1blroq#$PlvseCfMtGVI>@%s$G~{_pty!%)m$>i1r>
z?BB=V4hLKgxoRGeky+$kWZ{|ZQ=@487p@wlmE?TFEzN^xIGxATmod@C9|p-eY5e<E
zPj>vtI8$aRL6L(dZp+z@pvunxob3?!<lFOUd*Gwgdf@TlXFpFB(hJ|K!B9c`S&bsa
z@4qG2Q??Y~3#%TeC=($?(hs2s9<{TRHiGJ98kyi;$um@pIJEdmv=9?G&+0gHn9fCX
zQAr>qnDK!Qz3d#O6X<UAg&)ktKiag^@N*gmwVgK%r=pNW4L3yTR#Qj5|1uYP?Ta2?
zgE?-@&5d7!>rI#|66nIJSkqN}Pl-)4=!BVRG#P2J3amlco}i(1kpMgCvGQX5J#Vio
zkw9qm8)4u<)a%*3Q|NV-Yh}jZQ{(8_BGv6{C#S)`apUiFf%E3i<K<;sF)I7<i}Tgv
zJGj<wR4p#xTacsZsgB0QO)s-95?dO1azW|}9wk;_CdJ98+?<7(w}1*ZK(Snd&I9g-
zf*iOiVja^FgNpQ(*UlyGSUTdFD|xX@W#zcVF*MZIzCE0-rN$SHaww^}_~b}^D+qtm
z`7x2A(ut&#%->kO09>ErMUPCe1f*;B><Xo_+9!T=0uaIITzFRkj8KPjkK7b%v;^VU
ztYS>EST9bAu08TbrV&WSrdJCtv_895eBc40RU0Tk$8Xp`51W>yRe0DDdtX^hs=K#^
zZdSRu1epHAP(|{*=Y~{Po1uQ&cyt#<!IB?AA^k;>BeUK+eo2KEew}9Ku&6wwH9JXo
zcz~N_y15W<W`sI_)ujEVPwVk>{bsL~`E}z_ZMo;2|KrbmZ4$(E`@dZ0oK&~|-R#ut
zefm87@{Mje-j{^+_`HzOsly~EN&m2WyZ(?^dTN(j#%^xqM;FAB;kH0*w<P-lZhzM9
zKAdZI{X8%CNoo*0*ccES`jlPfM4D1^FFh(5_I|WL<&Y#QO8V9S926wkyLFIrNH3KP
zqnvu9G&$Rs!BZ3bYcmWAZ28nadlNR`=IUPO5ul7-0<Au%0B1c^$V#US)6S6=GnX*%
z#~KWpVS2A=#OpC51ctApaY{4G)moC*A7tWc)k-did5)bCCmCAaB*z&K*^G#k^yd!+
zaG0$=USvmNm@P9^&Q($tw=wM%3vEx|bRBzz8S8l<6-c}Wp4xWtZqL(1Jf91S%JkwQ
z;$T9#?OG-t<EO-vkif0^YlBVqf%nJ$eD~LBmHay`IU6A>!}o$eXSNyLaKVglMP^lo
zA-~5c0pB~mPmw9xQOP`9TP86EibQ9!wj}Y40s!(7N!UtHb_d!6VJRFAwv`^`kPFtX
zwnkry-1J=#Z9VSa1q7y5tp_z767S4ErN>_Bnu3JfPMdNCjCVr(CGxD;o4}%4C!o&y
z#Oerw_urmzc;dmC=bViw;A-~w#5gvJz}zRls|)7=(dw~aAAjzy4q@<Z64jsFa(dZX
z2X?31`Hx_475V>0@w^TFE)5v`W~=3CEWvW6ZX0GaYqsBPLqPq8Dk~X3HY{SHYe6y(
z69?e@Abkr#HYfOH*#`)>Nr?8BA0fa%0`^wi&Ew~h?qRvO5eRpS;Z7}Bh@1Pwe5Z~R
zW+U=8dE!zO>}lrztbB|!c)#{|J1P011!SZBtK6u}Ki>F6U=PgNLA$8_yg!kX85X&N
zRoXei*JDFzQ_{HQk*bpDYaWYI<(6Fw2GwQlAbQkV$cb}Sx>oPzFT%1$g9;~2-<@h^
zGA0C96i$A_nOGkM@8Zyt5$juFEu;XEQ)E+CAIR_0R|!_tprr_APn{Lf&OZ*mPx9{%
zAoDFVLvQ%WOB<MoOJDMOpb~tFMw=B(ErAfRNnFY2<$fGVd!M202pUGoafb=eX$s4;
zT%%G%x3l(yJLzLVcMRoW#D6-5rs5VxrzA)BVjvF8gA-sgSeQEx)ExN2o!hODNZb!8
zl3j%h1=FuRxU=C+mm~y9CZGQZ*-g(dMg4AOoWmksNXbTGD}cE5nGYhxxi6s`i3zxp
z^O5y8Dd6_yiQ2IHJh}0+6LLGK%NfZt{HF+0v;XP~l-eVNJ`rD<n0!P`Tpi(LNK{Q^
z%N#6fn+4{CQKWzb^eVHGKqoHW9P|E5>jl{Q@#sesjE@x7D)vAyEE8=Mkr0=RQ-UE8
z{A}tmpe3yB2r=|eyWr;3!$I+Thb_9L)Sl~%;oHH!fLEJ1^I)vi`%z?)pUHa&$t<)t
zcF+lLV0BMFJ;dr06-r!~nEX-BN5#kn3SH1_Ip*rLK*H33X_((*1z3DCY83lWiynan
z-<mf8g(=IH#Ob1pJP+}?u)-x9yQ31=$^_EHt)Qb0_z()^>VrFS=G%QuPiMi4%kx%i
zz|2zJP@b1-_~YuGm{UoBCMOxydM-f1G1;0k((;Ay{X0`YwBQl{(BoO0eK}_2;^+MP
z(9EW+{$bYFppE$-NJrE51=5xM8T6Y$jC>6Bwx7!><umOVT0Tr=4kI_DXyb^MQ=5^<
z>hwBAe%7eGIHgl@@4n3Z?qAVc6XtEwpctvp)C9DoUSoPdiQ_L%_!S%_Md>u9*dMd;
z2vWR6p3;?1P~T4!34c$zq!Rd32OL{O`Y_$(&TK_DTp3?J+d){CMz(<(1M|g0eF8c#
zvB{bJ`<g$2zUAtc;8OE|GoH-d;t5!ypx)g0@<r>%V7dF=Q4=+_r&nDN_pb^p=(;%a
zSU@9J+{+E>i5Tesq5hLT6jdtPFH^sU@E%B313exwXy`g2uIL+fLrz8ENw&GMMaK#e
z%MQeQ%!lvCG=cY-$+ZE(x?+24-=G+fNQx!~C*K|?x+|WdkS-eYIxa(w&MeX&v*@qY
z|7}lJv)b@8TCb7(NFj#c`c=3jn=OGD-;AM3?xV`@p@2U?<mTA#ak3<5DEE$)$ce%h
z&G%Ut=dXTGtgPHDVm+#+1JdlGmDjH3hdB!V3)&SL3MD|?rBG*QgO9_+bYLFQXf@SX
z%Fkh_yJXg8H$VlzSl95=QQ&vlomP$n#*YZ8P+&SPi$n`$=o#=lH>j$WwRPbzZP?Zf
zQ;`j@Cfee>{YY2iK0s(wwMW>SI9gVA-k-a%YIDX2x7TdaO_n$|Abz%hzgk+GWPvVP
zq5hz*$FGcSJI9Jht5Zo2$w9bU9hO>e%fDV<kEBg@0d3@}m#u|!t$ZOhm1dmAQ<rHM
z&V)cCO3yDOUFY5aPoqTEL2tk1V{wEf_^RBp@Y7)LD3`-M*EjYb0;&}8mG}86%-_Sf
z)`Z@73Fs~sCn{j}&{Br`cn%WS6FMlaUZ>GC(V$|-sFZi(Csae03510Aj*;i(F4avU
zW!GUBJv`pFXIZK6>&4E7>cBdFIX{|xE26`9KlB0{4t8ynCa28mZ+(kWo}FQ)wH2+X
zr-#;sL(4WhxP4Yh5a#;WfD{nq{v4~Q^*IjE$kJ(_myI&+Z8(2qv6+`&rNF%-?Yato
zSUjk(q~}xO(BN%}0;YI|?SE42u!<%V1H#znAS{*L2Iiv3d);x1s(Gi@8p2Er_@1G4
zZ4NU22Fhw^iv!4pvA1_wKS)Kzd7B3eg^L`jJDBpjDGzfPYoO5t3YD>(B!^4u-y3c4
zlER+Sd=J?S7e3l8eBSuq$QKxsAaYT^lDvoP<V&0_d(UmZXuv%G^WVg^JCHPdyC)ui
zBuCKi!_1%KGVc9Ony10Q4Uew{7_Ekz$|@q|EWe7L(nWQ3`vz(PV7uu1b|4gz!&$l&
zwK;o5At}F?6>^91J@wu%`hgpOXg7kGeqW`hg`HtxcCsLNbK=+Ju^>%H?h4Pu&y>Zh
zNsUp0UnR>sSsA!1ZdR7LL_pAfcsZ+%F;!4H`<re;5SXh0fH}(UOOF$MEEjW2`f@ND
z2aH4dyN3v7K@?kRYKq>Ij<Xx7iR+dsy~uOdn`b+`fc$*$!|H`UB0#mojiglMYGFQk
zJzl7$L<8Ykk%)r5UqR)djEITMDBGK+v!lr%fJW3;>1#XRjL+kJ{P?5f_mcJUdAa%{
z#-^<73pQQ*$A}p?Ad9)99Uc8kye}oaUhb^V%Pg=QxFa&ZNl=1M(?z#Qa;XyIFWuIw
zGl&d34novVJ{td`o7N3WIG<C(H^f1OqsX7CxSNH2GdN2A>r6Ys2+t;>@R*-1e?HxU
z=Q|V$R01@2_p)p3D#f+ydMys!eOJ8dX+BfymF0rs6o&2cp@5r<NAKY}un1DJhK*M|
zGX9}tc~?XSx#%8;G4TLPY3jim+!uwoMXKa}|F^5ADo4DI<2geDO)?{xZL%QSD8OE;
z7$%Z=<u^;pg=KG6-j}WIIZBbLzQN~7Aj(TE#wk;A$8Ghx+TgKCYS0kyga~D`P6_ce
zJ@UP_Ll{X-W;yV}WpEhG2*d7Cl2bh@=i}4n^RJZO>c<{|kRzdQrd8S)sKw&Hv#R9j
zsgL2%nwVkSRfg@LTkhhVrYyE~sP?70tQF$6nDYpNgqi91M)HJ88u_y=cVo0JPSW^(
zXEmGqL@wvJU=tPJDRntAs+Pc8lN|{fy%2)Ot*6HC*08lp+XiOhxWH1TFxhF`lZZ5J
zyKj}4sO4AH6-j#!kb0$|y)Y-cP>@Vz8j|9ZAgj9VJF{i(PPLbG|1aaFvcmoH#jE}_
zS)~)5>KbKeSYj+>i<r5?hOjE6c;6}zW)0ie58WRq`P$m`^ZfpGYZx?i4g27n*18Y)
znfsRDgt_xU1*KzIz`^1>O?;xVR<z#!PTGeu6)41ePqaR_3)~xsl_iw$_TTXNWr8?M
zc2#BNssSL{AIn`=UQaELLI#hSOy^iBsb341ng0oL@D0<J_TjkPGuyj1<91(TtE=fR
zF#dM+iFlDCr^24l`e4ESvG_3Vpan~&=Ro7v*VZCF)NrsPsfsv`(vFV9OWGe>^If|7
zE`YK<=C#by@k@(4miVBWR}NbPKikJN285Mx<L^(?dAZrTqYPag(nLMo{Jb&|UU-?$
zMqJ5}u$luGtCA-12y?!0qsJ2BeD<V;95TgP)QIxyy5~9FAvI^GTTr75A#IVPQeJN)
zhXzy{8IH2X*kv<7rl3MyYc#viNU3(jm&g0>WG0;;V5U9zi#T2lB|L|QPw{58$y~X=
z_CTU&^s?)cG^%zVi-19T`R;i`PI8mjha&^)gEPUp@iSdj-R1jn7!gsera}GnmA%k2
zc~jX}2LA%*bI)rSBZIorks7s6eI-EN{}1#)3%{4V#1o@NScn)hD9P^lX}T2aDyEp+
z(fV@hImF>JQJq~6%QauJ$CH+i^^j~|(d7mg?b{ChD`)=B{4_$TBe?T3&62{u7!CPz
zmLij54~(x+^Em@Ds->tPE?NH;x@rZWWs%6?<gq4`IBP^Sb<#~kR^GDJ`Wv_6n29FX
zc8?lTc5FRJ0XYiml8RBMylNZ;B%JerKnR>vr&5Fz0q;zeKA=%NMNzgTanm0JTqf#B
z?GIai+lb{i4q0J@_cU*INW-<Mu{0VhA&^lTaG-3Z&0|*Gxy5n}MC*ujp;Vm%nl4X9
za~6=YlGL3&QVGW<P%Fk^GD;T~7MD@cUL-gZaBBV!&j4(1)2QXHy8eoVLg9(rzyA2^
zF6Q5SI`*fp1yTfaj<0>E(SJo=Ls`5}B-S-Nof6Yt(NU{pA8x%B5$Hwc9iN=03Lf)J
znk)N0^Q7@E-!~DNQtQt-rr+XHYfpi;#e(7XKLSODM*(QgfKHZeQ*TWGo<JDTeYy*V
z@u?`ufTsK`PPRuA<@_up2iS5oQbQGp4t45LQ_WQk3SbH{#dj_3f@ASG#0=BaN}?RA
zQMV#`5R0oLU{sn)yII4?jKC0{L%isOX0MUx&qq(|g_HmQ8}-~cZ^8+BfCi0|riZlM
zR96weC}M$<?6nipWc@~qv>8*~)Mq2XPzjTHT1@*1HVk@e`U^5S8YHK~DJW7}Tt!xf
zKBpmoj%l7Q4l2O%um|AWc16ujAAf8AyWaJm?KfZkBkM0+dYvqAp<5sU%sIY7t<~bQ
z%N3YM9&rd{l)?#j`|gmuz7h{A(=%~zm>sB`drW2da>0>*-oK~;rte()%NNVM^}012
zNTPRMtI=f9Nd{N~lvu;@JM!?oqLsBjcjR)OO1;H?81G*ZJ(&YgKtoi}l>z}wv8Cy*
zt;`o~cD`drp%O1srqTF-0{uRH(oQ|{O`>Vb!HZp|B7Xr^ShtH~r&yoowb#-$x;Unx
zg6#t7VC{S(CEM-hCh!9dCiQ?Yj5iQDz%PM#6k>Lv3LOHmKz4948P}|v>8`1(m8MpN
z-~c}MhxQKwaNf{bR6moV8JFP6u1`q4i5=+FX2=Gu1|tZD2EFH0!+`o42jC<EA{~(q
zXrAqD+0OCQ|7HI>-hKOECHI1*|MAkl=YpT(A9LDWvcNxLfs`UU*FTYN>3VO%lV6Lh
zq~Lkwn8S=0uN$}w{F9@V;5@xf@_E4}`kl~Te+EoEa*5y-ql*w>tlxgP!w<(H3UPP@
zPpr>kM51GM&<LrhAeR!>33a2u6!W@^duJ8;r83O-&~$TpfC^K`kioGLn?H@C?NlhA
zQ~@5+MQDlj5$n{rVvmL(HaIQYqXLn(Laik)5wb-J^G%{AX2Imd7S{2^N5E=1bAh3x
zU1Dm{O%u{q(hGCQ@vrzoX)3+;W=QtOLzy?p?`W82pUjy_g__U8HPiQ!<stfElt7L#
z&3Pi69D~m~GYi%|HE;RZB`eOZSP|A*k>f4!T%O~3a@ji5==}^ZsdIP;5MrZ>oTy7N
zj?>eJ4r&5V8Z~WngXOQf;rUCQ!oK=<z3cLG{j*K`OBVQ2ERX=^OR?IQxK97zbFZuz
zPBc1fr}jEi=?rzZ=VKV7!*&45KqIDgGQasbk;~_TNjSW0y$~gZXk#7l#Lv!vLk2hj
zXtEZ}7#jHcUfWo!yM$%rXxHj34vLz9nMGt$=F<6)*5Vu`ZKJ}r5A14{?1@JZR%=t!
zfa*KYTD3YeUW%~628JiB+=r6Q=ju`EI>_K?1(Z^8mSju?;YjriQJl}w-;z^AAe`_+
zlKryUhbYSt|DruTwu*g24qILuVSPnQF~XCoj7C{^MTBsr%+Qh=t{vy+V5Fu?&RLD1
z8YxYRO%aD71CtvBji5!h8%^s_W9dxKBV0Ob?UU2in3=T>L;)K>;B<Tt6L8DnqYl(F
z!}f@RTxVKSN_mJmr`bC_wp+aX^1Unl+&zVFdfS~};?FRxmMn12ERa%U=gidFQSX`g
z{z?FiC^Zra1OgfE^pz4(4~!_WUeFgH=yqB!|DmkN=M+ma9~nSlT~HIko*0#^fzX-s
z&04e)>lJG-6+z{U5_0$^09IsdWi^giZ;=d)iWwfw$brXm*7VGdMmGaA1AKq@RNqc4
zw5{2s4nx{(mAUszON%apBT0Tyf%_VS<0&ldHoWeLu1q>9ukTq$BR2C$?1-S|gatP@
z=js|{;W#tQ4QCwBTR=oa<?6?7W-Z5hYqgv+%dXKdyk6#TCJXy(2VbSOl!}z%bD0Q)
zno7QlI)lbmOWsF^p-ZiQbsWEHM?Hy;I`dA4$bUGNoO*oJ{qK@E0x~{V$_|+A^y*gF
zdF4=l-y<I?y!KCC-~IEi|Fd)Y6HYrz7Wg78kRq5b!dPGSqMhDN{XP+l`Y0rUEa&U}
z5Md^YgN!1AOtU3(it8d7cWX*>Wf90)O(k=f85rsP8K)YK{Y<Ls4A6x1S$M>>oE&rT
zw(-_KyMvFRfEV7gNJYzN+2BNTVG0-@g>!Q!A0xFQX{zobvuhZpX45Sj<hxNEXO%h2
z`7SIjdSXh&Dl9JU>T$dtP`KWjz$Fj|JgunOm7)je#(mOKu2@X^smRnz7d>z%EfCkP
zgI&C69y5R$eg<}REdL86GJAy}Zp{W+Ehj`ip9@xJ@LHdXA)sf0jiN0!!};|9!Y*mF
z4%0_ljAQ69Hd+9=)|ce=iO1_&Z)(2e07NwKshxpdEe$qNs=KI5+qNr-u*z>OzT|h_
z@MZrl(*}|So+1k*fO(1x`}sE8S?yjybsEP-ky_pCs_g2ur;vif4s!NSiZU|nh(HtS
z9>Mva0S7tCEEg0!sgno1D^bs9D8s??%K1l7%TikFYAp^7Y8Vs%dgpuq!n`}f*mxL0
zDXYC`e?0+@H&_Z_bWY*DqL>aDvBF&0%Ci8;LcxYmOCCnla<JV%{h0L8!W^uuhK-;Y
zU0s&$8t0_Dc%Ols(SKuf5dsKruv99lSgl$c_2e8WvpmIfmuZu8R;OlGsCU}1OcByz
zP9YAZlvqoOk=8%*o5=7#i|D*o6^R3DtHx`#7AdXUrdNJ4CWl9Mv8I)!PNO1&QOn6O
zG%j3f)x<)y<jtVNq~Ug6Nkmiplj?JS{p-%*&p7QWS>Ow{KmwRA*i>Ksf~{u$DpU?}
zD0n}wwM2DfDI|L&Bc~o8YmR5_Ryw<-{+t8J{A{?p*H`*ah$6gh;6?Ya%`>%<tS%FQ
z$iJ7P%TigH!(9KI9ar3@gOh+p^ISNeOeNXD63|pGECQ39eZNJ0fsXygF&ZgnDUQ!{
ztqj(V0|Fa4@a>hp?H<V4Mx^Xl5AL_}Jl*w5u^~_y7;7O77*bN<JO!xUv#BUS&69vp
zZyxpJ*@hKXvFWOMW;kvU3<OYmk{%eqlBW*j;G<~A&5)56&X{>UPa+ExGjItRwve`3
zixfTbi4k2qVqeNHQlxfk^YKPuEw$zmpkJ;tcqG;w$Jv7|ms8>2#|%VaQT07RCddzs
zSY_LF)@ZihGI;Z^e9f2t3r$-{7C5^WNC0zo?fDOGt=BDVcL4DsG|pMi5op$m#{!*L
zNcLzZi)tcZ1klj>>vy$|43J@r9C2iUg~%pEJ%RltPqvl;^3Q`pb1dXXXBd_ZX)dQ7
zI}mp8p`lAZd?t_-m?+|4G9?ccs=e+Yz!GbWf_`(BkEVKn3j0}Xm<U+~7+}4njtw#L
zcq5VBb`WQveFDdSlEOAo4Ye167YU{{C++o>SpV>hRxJE?)yqN?WKP4{n4gmaXbLmS
zC^Hk)D6;W>7M-+!CeWJwp~lRZPV*v9_S!FzY#0H^C2j&m+#mjVc%8A;90VM!g1tk-
zXQXS#RNNZ-E~f6n{XiBiYJa*GVLN%v3X)<o3I>6An2k+<t}W&4JaRQf)w)J)x!e8A
zq3hms$2t5tr(Gor{G%2~MgD)(L}zh<PGKYM;9j@SQDcz)lvqffr(txla_@2~Vv>c%
zOX0usr@Tia^joSLtYZoePo5~KkgPAZ%aO+0VmT~-8*Y!o>*!r$L~tg(vmSu5+iFEa
z*=y}4)PI5$jJ}HVUA!g}BQt&D`dz{t?z*Xsl$MeAJl!JoMbjdusu?r&GRskym|5xC
z1PrQHzG<^H`eArXw-<euiXxQ0#yWdtJQl*&j>jwGfm-aEm%L_sVt2LYSFMcJSeXc^
z$hO;q0OROzltK%2m$kxwOmD?9{~7FFiwI-<<jbA8Cp*KcbJMPiKC<_+cf3vl@ip5E
zlp+DcBNx3F5b0dPciNcq;`C!-(x;j~bI`N(m1jLE%1FbkEjF6icE!!M*qQo^k?Viu
ziuup}2ymYMk_FDc1rorVqifK<|M@$*q;6cg-Gvh)MLCNMe1g4hQ?t8u0g)tN#3<uV
zHUu4^KQp_7kH>x{H5O7{{#cW0iMf}|DZ8VK5lwWZJ|xUP&mx&DSEF@fM3tqdluB}H
zvLYC*6KgI`P(Tu&&!jS(SA9ct<R!)k&%vNtU?{Y}MGb{Ml&eqa1ZQ39+SXB;F3}`g
zUR^c>SZqUM(iRp^(_oJ<r55!O1fdH)xL$c#w?+gz8AbIPo}dfodQg~2x4{vHA{=bq
zqZdV$@*;gFzD;cF_#BXC-LF9O<_wU1=T8Ehcp<dbx+pT3H-6wz&3e#@(TpE*wu~-Q
z*p?rLr`BXsiJxXr7l7DDGAuW-0{t<XB@-EI-7Zs3t6QwFZOA&4he<;=Hq7@H-p=M;
zaE^Y~X*bCN|DXjDz?`EWNWatBV?88Apr8s|T&7Fvy<TkBExqp2<xJvTx!kBb^B9@<
z@UW2e_q*6NU9!xRWuOpJSs_|c@gsA@a$tetyXF}^)t1S-X#Ff&IRiMKTh9r^flw0>
z@X23#1>H#tfixgCnVh?ZYH<t#6#5B!q{oWO96Jq>a+Ioz^gC@UVS^DgxVF*2I8TB8
zbZ61_RvNa_Ua*-aQA||=mMZjTV4&yNc)6D60)k@NODr!>Ezvj91!Ou>DE5ZTwsnus
zSbmh0+d!R~4HGV1Of&od0_8eb&Hth}WmZ^Uj|~~*cu<&aVl65R>wDB0L1(R(Df_KO
zj5!n0$eb~)ro&hZl0wZzL(_7k5xX>PmIenoH*5<_tpEjDvu#QnHd&=mv&K=~3>5Cj
zU-_or>plLX-*S$A+G#h*0)MXs62P3J9|sZ06?qE$a;hQ4f#;wQuCc~GzeOnzuYD+`
zf|sF%+#?DfSDZOlxume1=l0bE7$b{t8d>vW7R4xS6>1^Dr$sOA`TyY1vgS*l37HWB
zv`&Dw2q4+=f#K%&*PbHN_aTxAqnrgaye4uf5T!I1VU;c72mzMf0#OY5W1*N1sCm!<
z08kq#qQy1Ao%t@B6q^+1+nqT(czmB#ub#Kn>bRU{h+5qF3k5<ADh4sZov$md3pm=6
z+(+#=M?<F7YgRy#rHF%8hrMbtM)!OHx^G{jj<aohWEksP5ggq9Zl`tjbC{PobuRI+
z{QDTWgs;N)ZmVGZwRPfe?#y=A{RpvPPXOu*hmGs^QDkO~WOUkdr~?cS@)^U@VB7T|
z0CW`LjO^fkrD(mQ53;|7Hx69+tM6QX{D=Se9R0l0ZjuE)uLTmooTDEGck_D+OfS^f
zTp#MF20K)O`$RNBDvSR}<0|T91<83yEQXqmGrUxXiG3LHCOAYRa<-mAQmrOSk2P6E
z;X~%l!d-R9{eNzX%<DKAxaf6O)OW`lqMYC?gY)&M#rt*yUfQ>!mJs~}-U59divY(0
zJs+n!R-9ucoM9+b>pXOvAu=neVUko?12Edao%!COqOJC4?fQX5n?WLFPBD`j4;@eh
zB#LH&!w<(9q8qoNM*$&%2sYJzc%l!b!-3`tLr|D0j3d#~pPwXMM$;n7$-O$m-qe>O
z!OXBCkH`-5cd}eeXRTv{1(sPvx)4@|_wnL{@6UjaM=XADS(+_GIcwXAOg_KC(6Okg
zN9TxaOi%qWbpA7q&3Y&1tvonJ`j51qY%1M0nr?~%8?Cez>M}soSUvDZV>kT9{j(2#
z&qL?vC!ThbEbw<MkO1Z!{V3Y2wy{X`q9VU*S&2F;mQ<W|B2Vo#7_rDQ$#(|~n(N$e
zX3i@*vdmj4`pGK44i0U4Rg`X%jJ%L6++{}`I+NbYm}qjsUI)seANSnxt~&#(a3}83
zir=LNa0w~57(GO;g7=vasdVRY$<rMxBg@}sh?uLb_@9R^XMC|zK%{$yhvAA5%}g!R
z7Xvl<s(Rm!S9|uTX5M=g7s11wb^w>)%mt1CbR3)=4|T%YuvE+<;Hci?(Ew3R9-x6;
z)`JMpom;TNFk=FS>ely&MkAVqFTu_s5{XBy=j>)cwWjQpGPL*oo>~5LQey(>?7K+3
z@Mz9YMkbMcBa6(Lc|?@1!W_V@N39A2SsnpS>y)j1oDngnmN~`I$hMOW=#Qyyv)mq1
zXOlIXJ-zRD*}(1R=qH|blPr(`=6wAI+AI0Zbrr|edm%=Uiv0o`rw5688>T%fa_MFH
zeAmGu?csO4s@IST@s4)6omiL1nl*eGrLcnb7s`<iFo7{9rxzTp7m__<2@X4RVg5M|
zPa0U!gU9t^-NtFj(Fyvp0hD6)$8k~GVL56qo!KsIv4XV$i@xT*2$Qtg%&|Jia0Z-0
zR|R<L3q>Nj@r|15S+SXYE4KgMyREwCfYrC$YzuO{4wS6tho^Ho6Defx{rKF+hp54i
zElTC93*MtU>6Zz>wSVMXdP_~~E>RmJs>!1tRvI7U2B2hJuX8|B&_>kk>6?s{<p2vI
z$kDZMYH<YdEOPRQFCJ_5qsyqtzOfL!ga{{qqc5)Q3Ah1P@?svz_%+ldGoNnC@zc%U
z)5zvkI#*HoEacnhW1^A}rG|2KgfX{X2S62XE04Y9H(Q^6!*`vp-+kKe)7t_mf;mUm
zzt`xE^ouxLNz!MkxpEj911zmA(m-B%k)v`qFK41qM?iol(@15dvHWgc4m!+_&kF8=
zW)=XZ-$C!JO*LKg=gg@)n5?^F_J~B&7Q^kg-(35$nP(2^uuQ6D{T;SDL@gT~YcCG|
z>Jz!-@zuF`fM)@Uus1box#)<at=(bzI$>&xJawnR88a<%({Ryl+&*ac@4v@RzVnZ*
z@Zu3GZoDdpLB<qP&~gi?l>wLcs5k({j`tW!`1yr`Pka`DvscOBBTm_abylDblLKUW
z^qaI#%#gA}LkxB20yAPXb4K*w9+Y68W}ShG&PAZ=wp$N&5tp-2CgQr+!3309k5N1q
z|9TCj!?lnQr*8#fLP1THdJy({4^5lskaS|d%kfR@q_=1u^#Xw-=cSo88l<Kkoyw@?
zHeF?{na92}_sl>3aR2Ua_``GbQ%}1|7Fe@D0+@63V?gqQWWQ2k3NUaW(yEgBA$6E=
zmrb<cZY9u&eUz@g^WuL1mr#re-Qya86_(ceAzc+3A2pLiy%97$M%I{?5^#ClV_lPF
zaE?j?g-FEpu%d7C<8Xk<(q(!bNdarkh^$S~#m59IB9rpX3Oz3Xjcb7kNW)33m2z~r
z-k|6J6?T_bh{O4`=!aRw(T~CYdhVVf8+;CWTTGbz#N;XKkB%aXFB2+y3Sv6WH+JQN
z_&OeZsMW-sdec!n2JKL6JnH`at|S0B2Ho{5S^;o!OQ1h8GhK$7br(?tKtkX5Cl(jc
zPN7B$4Ypffs>dENh~*X0(?KGlNx+E!91+{x(rycZjE*?~_!@9$zr%Q)MEb}xw!o!!
z@-&c&^r0Rq7;(KxDJ5VFYA=P2W27UOu{$7BAK;WYIUnfEe`{#GjbDZ>9{Pj)Gym_$
zx_5u$fphe8PrFGLa0?`WIafb~R&UTF6@iIQJ`{lHjyN<*`n^1W;PUKpRnE?00o}c7
zbh1=|-C0I{mm(OpmGgQ_WOm78wEz~z{N^AOgepNugM~hq$bJfmnCJt;opp#LVw4b_
zqx(m#=Qrp)y8=OQ=7CGVr#|C_@09uzrvPlr2|xud0rx`cOb#`213XQr&!fYY6IBFK
z@w@LniptAYYoS8SoR>QD=|^CI&B)<vMhebWXDh($W#AGcDL-s6*<5Q(5sso3|I`mA
z@1PSVQ+g3yVDGeWq7<C9D=$}Bh9*%%W~L8W!K1zg+msW}QB<cnG>7(?Ni_gXFg?(;
z$*ip8W307=eTC9lxTXPjLgbfWPypZn{7$~kA|1U``f#N|N>4Wrt5#TAw)W8}Y&~d2
zhzVV#@TeK($A%F5T%uQonhdjR`lQg5f0QzJu8fH2hHHor?|&bg-+Qip?rA^C0tsNw
z;ctL`kh%s~c@&}OB^z!fknkQCIgwDX_k6yKJPKq~<OXEKo>Dr^qZln9&!rQPV^Zxw
zcg1|5sfw$i-)3DyMRsq!?u-M#tkG|MThZ-hkxD$4P$vkr7@spAHN5;;Kmy262B4M#
zP>C%jfN?Mp3Iw3>_c79Nm~-&ae*&9Kpp$1iWjTH#w=o)K=I{N8ov7c0gS^(NL@zB0
z^G=X}vg>TRn5Oe>_`U>Uu@A@lVq_ck25`CK_wY&e6xmoXB{S-J1oE{H6GL)Fo$(UU
z?5mH5^jV%h9jOWAG^d5MRUfBsvl^4k`bwk0QZs<2h)hjh(GctAz2+0tmqX)Z1iM&o
z@qx;sc6+){L8$Bo^Nx*rWCS6k2;)t)A<@4398+uZG(A0l9Q*3hvD%XzL<yH^hfpaf
ztWxt@rsl;g87WNjn-QJas>7Dsa?PIp&2RZryYCH281o$dyMMaxDn&5o=oi53_-fw^
z`bxRrnUu*;A7qSVJXPc!^1<n=Cq>ato;fFBuvJFK&58g5ric*kpygQcjX1v?cl;hP
zNY>u(qGReNdk**I5nO~T;jVkFPLSC|chdp|PoV`z=+RD8BX*JkuRY}l0}N@b5XI<o
z2OhSmXV?1*7^QSt1~#7Rir;BS3PUVeKFopHTw1kP?rhoP6K3z^xCLH{2)e6E|0xO-
zg<GCg;1a=AQGgcHoxeUK)zw$TA-65u2;An-%jy;gfWH>y$euErr`}w9#-lOO#Wng^
zL&b7!O`c?-4hIY12o!7yXWt{b$Z9>R*N{>Z*yKgS2BeHf0rEf_;4dih(DfE>i&(*l
z7j*#|E$H10T?42JY`1&zI3$D>)~6S!G)Bs6JHHpwV@gGGjakUI-6ws9c!3W(lL5M{
z1Huqp)i`!XLF!1G%r-B5ZSGlr{`%f$Ui;Q_^s`U9d74`w0nEAj0kpazn1&dEUKsBS
z(cNuiG`Y)*)Zx%)f@AQgMJce5%2B-_bnwT%jL1E9$rCwesqT`a_bz$&$LeZ#ZyF2C
zF+y=_bNEbu4jM5c3Zwyj7$OU=-2|*!7mWy50(dC9<@#z4EIhJj2bOi~xHx2pI<htz
zmtYe%2z<P!MZpXEMFdkaWz|FbY{$eyHn!tB>yQ#!S%wI~zIw=}C{3j)kLqOil${@E
zI4(Z}MQ|&S9d!3k^ph{rL*f&^^nIvwdCJl2SAj<Db9g)psfa|2YbH%pU2v9{fX1JA
zkRnqm%FmzZ2EfsbAB}rf9g0pA7;-93iFVF5)%Qf?(Ib6XkUBx6QrA!vSOE|^0vT#;
zz4;|f$hj4$_h_;Mvzp68V~kB6Az<YC3)~p6)(1RQYty;*^?TBJx>OD_{&iljz%Uc4
z@?Q4UFt&Vc8@4$0y90N;<-N-veZ%~@`st_rJbf*Y0OlNA`~Gh~t2lFNb-<$&-5dih
zat7|RV!wqvq=J>u3lki(QfJ<|C<hg8yJNE@cfek-*6PGPa{h{jD3a%)1_$C<)|45X
zs;879b_5seC?XUJsUim!)ng(k_}AgRD+dcKv?)a_s=367#0$^vWc6T6nqQ*VYtajz
zMX>R0M9>NBF9DZ;M>QH=tM(!08MPOwFxQTquxp0jZ8r^Vv3m!u1DLmQt}wbph*)05
znUWqNMb$+SQ=kyqbHe#jbnu8!&fh<xwj#+DvGJtR98QdfM=k1?(dntEu4@9Q;Iute
zsR#wZPCs|qLm=gJKdqn(Afd?+fA)k@L=#?j@<>lW5BLb|0E7<nU3AWEZ;;f%_HCt4
z?Zj^`7)*ds3i&$c6(0HW1~8;)O%$keq8<WhlIfQ+fqG$3y5w5W&*Q%tKufe{2bHSt
zyVoF%rzniNmE?o+1?n~fTZWbwPy7l8{2k}ur=NE5G`Bzkm~-(1Xm*O-W``SM?r@dq
zQXRo-_R9aS1DWX1!*|qVy#FGmuY7#2{8MPo)ID-u&P0($bn;!Xpzta3fnMYBb%idf
z12~q1|9H_kfJ^r0fFq}`KgSw5SfOEX#JUT15YRGN<RZToQ(+D$VJysAdMw=C^YCa#
zU>TMVjA|9T6@BP)JxB-YE$7ry>WoMzPb1`=H%;5Ew;!;#&!4gnpV(xNw)fhJ@&o}O
zcjgQw%NM!dr_kLMdrFav&MHpZ%wdQUPPehLz-a(DwNZ={e)5414qMF8xa^1Oc5*b9
z{SfICe(0^I;3Ba=_FijxRSGgQShTGeaiM(X^kQv~F%4`Q08phzHl+|B(v7lnU<G>A
zBv4SaL4?!S_yHX!sWzoO)fZD>+ltsc>qWZG!HlT2FJOi>=*&ZjVd~~W(lSIz-3En!
zB0V*jD)X1CZRS6V-(G&r+x~C!ZLfL%+7BSTNEUbsERX=^DKP5a*(BhSYcYXc4nTl{
zzi`hLvR!L+m+Y=yGBTb{lH>L0CEhjr;;N|xfbf)#925~zu7zV+B)_jd6uB&Sl1i(|
zO$D$CC=(b>4ouM}W`L62(1-Mx{9ynE$0$!@@enY04JLf%$G8quoI$3~WAQ&slnmQ(
z5Q@!`;W;>3-!T8zO}+eVsgATiPO}aYAN|C`cH^r@>|6Kr?8TdR+Qajg*~ca)?B3H8
zc4U6UrWXdRQ5+&IG=@{vlo=ul6?0@be%+<QoDI|#Y;KN}m5!y(P9!p_##M!0&g{z!
zp=+UZTR?~&DCD;Hlq2@ldB_$zZa0b}WxRj^!Nte(&!gR)oS`U3i^ephu1mNiQD;ua
ztna;APg+jj6K#3wQ(&RAA*r_>HJTiC97VcLSkQS8t@Qa^5t7nzeed1j4u+IizobZp
zXpe@>d`-fb)PxL?&&mdCP8|Q^;U9VDs~3Ldl^;JB{{+%Lo}Ly+0CO&`dlL_#qHI>*
z$nyB|Mb0b>Koj~-f|KzSnjD;>6fH03D8Hnr#2r~+LHQ<?2~lbcVM(7!AETBGoE(T^
zicPL5+;s;a3a<m8=nmN7$D<Z)Jl0M0+9R0wU5q%~w`RbhX9pUAp`KYbtHZ>!*m_1;
z7}yohH||dR>(JFN|5~FjCEp;%IJRW_|LTBU_N|OrE<a&ckKJW2L~{S+-fQgM$!+$@
z!((>ep{gC79I@5fZfjL0uvrs@=4IQEpGBp4%65(y?V%@@?EX`zG<S9pL5yggz4sR>
zedL;Oj=JF*X#bvea+Lw<9V?tzX&?Zi%;HU+(LVk%L_Q8KffaFvK%wiZPjnq!pfqeF
z^c-m;S|?rw)z|b^8`p0@Hwa=5QuNoKjwO&3RM=NI3D{`6>eJC6I7tyH9VfjzMaPoL
z5z`Ft>8+q&R<G0X&$8Hd<k?qj%x#$Y;KZ-I_s*#wdC4c^Pa-`f3!GI8#D7+2b(?8p
zPqIM&=dP)qKDD&Ev470_D2zrz+Ag`9#%a`ePfFc0D7x_sH=k==<3Zvi-Z`&DJP{(?
zM}b5uq+#Z8h|F6-EFgeJ#%S`p;CMjU^Sa~9z$SC9zOo{f=xjrT5kAXwG(#j3!6n|r
zXEhi1uj@evpvfYij6oKZq~)0v4Y6^Lj&v7K>Q56K>FO`2%O9hhobB7gwY=T&rmL;H
z`KWce%cR86Eu)^$>yKEou)_||Znj5HZ?=a{kJ+jAgbfatZO_EA?Wi8N;r3yxme4rs
z?66<|=qCHf+?CcCxD1N#ASW?E8VQD#wwD!|6frm+lU{?-);T!KEE&$JHsbXg2a$mE
z0T|@aRe1NwXgF^N3k_R~R02<X1}nR5+^(V>>sb+|A}LXpwN=jFqek9D_L)lq`6gu-
zMJnu1<B~b%zBk|o;L4#>*4OYcj#>RYIY<b7rB(TVLF9nt8TcXq5OO%(iFqr0B|@ZI
z`!=w&&={KTzy8>_f8}4Nf4Ip4XWIfPf;rn3{e0VN^_%@>r-h?I@C1je6bI3XI@kk?
zOl&PV-Hgu5=c-4?NGDq+0}4-*d9)F3g2gsc^iqI87IOOWizfj~mf6V;(eK&=cKNg4
zPwUw~%OaU@x1KSyT;<t&PE`00;IJ0S`0FgniJwQn$*9d)B;=7#q(OUa#|N=#yDGTz
z3@}MLW&|2;X{uq5zjx8D{RWEP=qu+LBu2<at4ae9n6s-#9=2=72JD|)F^qp5#OLFx
z3m7FgN4g9)6*Z7fVXIYN@_cK)d)4kby<jV)&4BzS%til5hk8YH#y!$YUdoXA^1?D*
z^V5I@%V>}dHv!Jd6Prxb<$xs|bBK0=83O<X+r~6zR@~B8o|+6s4-K-e+Ys<dz@+um
zSnD7iBN5*+AOWzrjYyZtcB-v-Q0397A{p*ZJY`9$%p)1qX{sD9dl?*ZssG%<wAuES
zbt?>B<M@^b>eX^@>8~#TlaKtw{;z+*FQ1El4rw1x9}A=i<{Vw?$v2KqZ_G_^P?SP_
z+M|{n5sY5<RrHnf(sv>N2=*1pIKYHdiKo!K$s;&_Z%*WGo)voYyhgcLqlg3Q#79^-
zq+OyTazIjQF8IHgripjs*;{?St_~9@&2iCKiwV6V0tA-LQeqA!>pqLMn&5;pFcOez
zx%gSOKJR-DvP1zm)}H2~c=Kk?jx6Wxp`nVs>_1)wk!(LscG<@rc=x@KSoIQ59R+X*
z(S$&ow2>#fh<4EKS}k8@A31!5{o#jlHn8(~_JREkJJR1v)VPUAhE!5@fV+Mu!Z`3e
z4U4TK$8gBHRuj%%%b`z}gHX`VYjm;Z^$<x&eF4Cu&nZL=x`=+^9F=24qr(Z;?sJF~
zp7s<7#Hc8<ziK|t9u%l39Tx8tR6fE+p0-oXg?_4nH>Qe7C%Y?>;qxX9m(rqB9}cO#
z9Ka*ZwHy&le`TMQZ{m3G*ko<gs{7cnL}WHX6YHja{(nC7n&*AbIr=A&cJnl_K#E|_
z(M2wFddv8j4eqF&ITjqf*GW`|kw5agR-C4C=z1NLVOU;B5<$X2Lkr7Ug+|35^%u$x
zq2>{=Ax1Irh38nJ-;1{tq0}Eg&+x@J8-T;BGhXms2QD!!=CzZxyY~n*+P8y+-pAK#
zwHFrldQA46oT1;chjg5%&x(lTkcBF;k+ZhcB5Af2M|t2#+wT3d1-tG0aEyfmWB`j+
zRF8?PhvJB3L!2w^jO<2t_@QAl5EmMJ4&$u-sD0(OmTi1S(e_?nwZFZ7(0=^?9kB!R
zw}45*3`AoH8Pu`v@kxX)iGYY;O3;S;b(}o<YW*e=M0FH~7wI)RWkZTebYy@Gr=BZ9
zNiG9Yp63kb=B9TuE0q=OeWnVgWg_V!)@EE*#@T@ZFeF<eAk7+W{r1AU^!biKZ9Vnh
zkQbGuw2U;%@+E)-Fj3Eq50T5ENhWbj2o7BotMiw`z=KtnlO7vUk$4%iOv-F7FMao|
z@A~+b8w%~O{XZ|cQ&%tjB@2967Dxc|Wtr~_U%A-K&E$Leozl+=BXQ;Y<w>IhqVA&J
z4{yTE3g+I>7krbJ%Wh=##+{VX?QtZrH#kOs2~^v#?W}mc7S&|?jIPcdUItiMdM(~;
zuK`H}nKg&Gt}YWIkx)|z(k;QUX8kT_Kqqcw?Yjsv&Pvna>iwDZvcps@L?q(N>t*}I
z-yF2PFS^P$-ZBhlVj+NxQj9ownaD9qH~uld*b}QtWQZ89l^9uF$UR}t+&e_K`w4s5
z)?K#pHP_is{Kfq?S$w8PI|Zff8pnBPS;9(1Rx8suaoAc)J<T&w(jy(Z<a2z6Af|v!
zX}ca|fYR_r>y(B$7N>f7EhdPQX2>okgsx{8Cf8S6zK_aQRHQqKsM_G)hp`K;DMr+!
zh$+z26)kD=0A8DDhF%#pl6KXcDOnnk7ntN(fWF>okxGQ~T|#Q6J%B2}6=c{Z_}<AP
zK~_f*a+hr`F&Ah5^2%#ARVshV5x(vV|8=J&lLbD%1rorVqhAJna`QpnJ=9#B{)&@`
z#H2Gh_pCrJ2jWb!81bxMHbxxYgQBKQX0CWAeJmoMNt0zrA%_<B>%L7+dEI$Cdp54?
zjHv9(?Cv{86#lVNa{`2nX%^FDUi8mWW^0j4Jf;Xpe@77@11gVX0>G?MwIiihyDY$%
z+ED7)=ycCM^cyGb)xWsSieoex0@iKG!-xps8`mbpPLng2Q<w2Vq~xdKK7JL_6>==x
z>9-c<>?N1wZU3D&*sp!!pv_nJTDOL3@>M&C9Js^RXyp#|8)>`IZzIg;(coEV=B?7E
zSl%IODDtdmi$_5+J=*0kb&nXjsIH>p2$C&|OmrZChqLIUey#6?ii&z1H5AeXLlBAO
zh|qHApDAUk`jE#zS_YCsC>8-aZG=EO9G6mpP?NiWp;MS8#;N%{i&JDSXDv4lb8hP>
z5gtS7y#5t>k)&tly+dqkp7Xzde)hHBd(U0}>(`%gYih<kNB-eb-Aw|Rb97~ChMQM5
zC&nIuf*KvRAHrJGA+P1gL;%v&i6G;VjIWRtkXc_h!xChqPN7J~9Y(xc4==R1KFd;N
z@nqMh&@!mWf+=q#j35&#&ufGN2LK0~=&fVa5hgM6`R4<c7|~<}@c>TifXBBbXP0d^
z9=05y9BJ&`se#UWNR_?!L$mgow{5iNe%of8<_W*M?|JuTv|(gzg|m-p$m%0;yHT^?
zZe9{F;k<e_Jb%A^!*v_&(?^Q-u1Qpht3)8wd(`-t8&|}DhBJ{%Tj@Pr4?v{P+?Y;x
zg(wI(n@8OkiT@(ZjS|~iSpbx(kP#TqtXhrs<8>Mxmoq5hxjT?62H`tTooNJsOaUB4
zDSw$WG<9-*fJlyr$&p4R22zP)iAl2GRiK*?5|*c}SR;C>j=R!wpefS}qmjxJ9R8Jn
zYLy>~HYvRG!F1uKCvvl$fHFB)wfj!aefzH;_{53t-SeD(bB_Lrq}^PC7DxbdjxH)x
z(P?i&3^&^0Mpe#4k%j!I9IdFS@|7y~s|QA1;W3g44I1Gl*#G1-6}<p3c^+%jVF!;0
zF7BWIF1VceIzyE+MhI(W=^Q!4=#2HTJI7cbh{y0bbp|iw>l{NgvYz6+_I?dqf-TmT
z5twBuHmx2NoBa-(9hU&)af;Hr@Y(PC-TgLx<sQ5IS){6L32iSa5UU;&{G>-STFbX7
z8|V&Qk~#w0&@}0t`7BidaJF_Hu<yEU#2)+W{dS0f&GhhlR3Rl~m-QEW0`ftT`6{_V
zVv^;hATc%!_)JhhpY2%d)HJL&)inTuB><+n9p}G+J&^l!5RD+{^?$a{b&<}Qz$e#Z
zJ<@kMc<WtKWiH&WOCVKeMLYZ``ifE*#ZaLB(*cPTh2%6}NZZ!UiWHr>R^8Q8q{P_P
z-~c2v&H|b1{tmWy8Do}506%XYRkWnvp5iMupt_gNlZ_SiyCUuPNO!>|kF5OkM-Tn4
zl{<ES$G<;E|4h<uE-?#a?&=adtqbpTO1bih9))7lFfJkvrL^=cfbfTA*Nc4{U#r_+
z?|r@s->I3>6#xV#dgikp8dbgyDI2k9gQ_vlQE*Z>sFIP9oJbI%<a>Rl_iNf=;d8J5
z_-A^=@4Xv;{X^+79Fo_F1Zo15@bGk<XLf`R!Ld6wt&3Jdtwo^1_s%2>kxS^Ei4jY@
z!w;Y9O$=iAr516Ho9T(kpXk~j|EJ@&bevRKpSpv_1dy(B85R$Fv@O5?4`3S)Bp($6
zdK`%4?&)&zCdEQ)(rzDq)V}`u1*>(A0`5vpQP&U^xU1Y0LXkQiQdLFhy*YqJqZ9hb
z_%oWt`r9i+2gBCg4LA(WLlAhJz7vQD%SX9Zlh$pWBHEm@LSxQ~tD;%YV>_5|T&O+<
z&vzI3Eu4m^0fku8NEMyX*P`Z0ksfk6ND9oN>33$>)@ja*`csbqW6P5$H=Ll>GRrA;
ztTH;pfH7(<<w>F_z8jmj;vRlDR{)k1JbbvFA}Il;L{wwQDZMoH$nYcS0cIiwG{hRI
zKWe-3r+?|tu^)cnh4(Ki9nr;LffT`<qpMV~;-M}gj(Ii0xzpqydJGgoBb}VCsJFcK
zw1=K~#AnYGtwk_$;@p)BbOa7An;#;g5FUkR-@N{lJ>+=xd3+z-cU&bdug`*0j$it7
zOzZ{Ut*u0$2=C8yxa%hP47dcKk+WC5LCdXsL@@B>eY<C(p1>%J$LX~m-EG4brUtPA
zTJ4@{!9MZes=f8MC+(YmgnpLl6CT+WBzPG*7-RY|9II+I`p(C)kpt4*f9OC7k&M*!
z2UZ`mZ@d->n#r=gXL`iilAm#*N*p}@lR&0=VFa*wQb=8j?*cSMIT{jI7y*Q;L@Yx@
zHz!F~O+R6!22o9(<es|eVWSmAZRR@n0JTmLk^ps-JPy4?w9sh+5}=i`YP~!l=HN_}
zrGh@R>^yxxd7OT`F=OQ(lPf{IUVhPRm}sU9L7+Hb%{pLP;b_q^D~ymjta8kQ9RDy;
zo3zWcO+O!>k`tmh5g~MOB%-NCSCI%bv9n9(e`&u}QBNp!w!dxV<Qtz?-}2^%&(S}Z
zw3|!B0tsNw(S_`^Y2zp5v8G9t6>HtO@{ipWig{M8CDvZTl*Ir#8Krqm@<r?)MJ<Yi
zG*!}5Zvhq7uwe~Y;z`Dux4-iod|cAO#bIL|syI@Gjb#oyiv~gl>I}FD3_J}a0Ez3w
z+REpq$pj(+bk?)WLeOyrgtD|)=spaPr(>vMi!q$})^gXj619BtZKrI{l>_#|uieTZ
z&l5Q7NLO}{34j~0R#PnFXF5YyU)E?ZjID-5Y*ETB+V<W(_C3!Wu*d)AVcTyv_>6mX
zvHEyw9DSelx+_g9)rPD$K(wMfna-0w6hAG^e4$Jt08+%zZNMyBnpaP#m0MLS(M!|i
zCL+&FnGz!e)SnZO1g)?Fs>&Qgp0G3D9p#AW>5+|{w!O->0k;a&YbEdir2cr#3KOK?
z2GuL0eilU^vLgV(>&_7AFM&-_{WALf#3G&qR~0RK4G;^^Z`6h6+-Lrc^z2mE(cGd!
ze%b1!Me0h25T7_XYW=I;(K_+Q+sj+uJavx#$)w#}0v1RBbB-<||E`buKWs%QVsS-3
zoZV^*c+~Eq2156R(vlMB$S+84MWd7z&3Grd9FHOw9OtU~9QYzjG07cgIHBd%B9=1|
zlRMv-=Fb2m1SeTfhE~iDFl-|@YAqq37^04dJ!gO=0~rUJ=sO*J))nz%@jj5O)}_TV
zgJe;UzE;g7Mq@qByFer|j58m_(Qd^?Gu*(x{eK;`ZM#*E8KcfU$M_Q!%O$<yu}d^S
z06y*0vC%zosO>01P&=lKRwVs&JE{Qx-;I~sf4J`nn=9@Sdc<f=E6Wk921EL+23V8<
z10?wURHV$HYXibaN#91;9)P2NU|obZ)d<^!Akd~kvLNN>qCi7TuPSLbN&TxBPb7lF
z?<_AsCWu;dT=)DC@$V??Hh?Q%=_9O6YK@vnVTgUHM}|9%9I6IB>Op-`YQj_=RWkjq
z$Q{nr(_<nK_~ivmSl$(TT*)+~N&O~=ZHn4lWuRH!n!KGw4X3xaY1h7$HD4Gukp7Yd
z{&5Q=fceL5bXJz=^>RmBIF=H*ZXa>%zRyF4klKnh2d`5^DH#VBPm}Qv-wXb_^Ndl4
zq5x_*y;aW*orxUc-Ts<W$)cFBt92=|bt$?CNb7OQPkKF4WzmkdvF`PmBE6irue9dO
z*9ZU}U1`M_kYvr6p-l3*={5(N0APVGjcfGnhH_yEhdG7=+=?BRNHslnvTc9xtB36e
ze(`b}+tsud6E7=>Ip!#GxdabUhl>6_u+4**NK)|enqJBeD#-vznY;XAW52!rimKgn
z-0Uw-R;*py>2~mj#SpIen1B+h#uAe3D&^_S4Bh$T;+^%ygWa@L%c%kkBQU$Rspmu*
zYZTH+)A_ME3k+tLZYJXOQSVKenok$ePw(*&Xo<u?icZ4?0vPDpcL;h3jU?KsQ}Ui4
zz86tM^?DaqRy4!+ZaK}m2_WM;Q{?7gBUYMhOSzv&$kn$2nIg=&s#1i#6&3?fbEnw;
z$ek-EzV}`At-t>Av+|E3ZRC=-Kq~T|qf0#S!DFYpoepU-Wj$1Q=O3~j>cH1S#d)Q?
zc;w=HFU;$;sK_er`?5U8V!ZA+`zQ{Z9!iz173sA-K3o5O-RrfPElvUm4=uYkI4l<I
zy@($lYs134I3FKt-TS<MQuGo4bKU29FJGL|l;zYtx{1qV>#w^WUWXAldCy36j_|RE
zaL5~Rgj)dL?NFB|A8y#6{MJcZokm*>;m`JR00K|NNr=*E!jh8-2p0g7_fjBs2vf}`
zu)2EOxleQt*!Mqc!Jb!m$m&Ez8snVPv`R&I98j*)q_;%9F{H6-nsO;NSg7fE)Z{e{
z?FwLHta4A057{tUXRI^8yc@($+f1Eo5p8uyWA)J#Qz1wS$Q>A9tuhqo@vb#T+twVS
z?ow(H#L|_I@f>r-Uxzg&f<R9FIzFr?f|RE-St`yFGBsin@dzNZ8<`M}3aB6wpa$SX
z4o_RMEaU+VF)PxbS!pklLeq~@`-R{@x~$PX@v;wm;CtTr=kNOAv2*lKC++4Uwm<@y
zb9nKpx!z-4HP`7MbRg}7hl9u&fZ>MGfoAQy$O;Q%iN!!udMp5nG_ZVLecaaC(64o@
zu{Fk+QXtx|_MdGtvd2CLWCS4b7<H_fB!8xu>}eEjOY5!!9Qip#18dB%;Ku_2Ax0}}
zb($xEPxzdNd(bmLpp;Y?C9Nn#X~M423uwbQ_hH^o;J9}nA$s3?r|qx*Xvr$AZ4j|Y
zGokcSXe<0vj1opaXW2!O4%GPQ57BaY4&*n{dP`QMY4OI|qxM6$PT3VJcU!4B7tu11
z52SI%6f<fe(1B};AOISHh@u?db~pzYr&QS-(N3kNyhfx*M-hh(CTh%Or7xF=aY0L^
zx^&K2A}{nZ1<H9yceySd=~Mw?PmD7c$`lY}jhjI+!jH>m#jri^ugQ=<{P6q!!()9>
zrv0}l0Bu+q5z!$?1vzP~MK)Wj58#vz+g%Ue^NPQ@Z}pS^{!f2_>1F9JS>W%tKmwR^
z`0J?UN}qHmnmK0ygab*4W*j&iZ1h@@OpJ0o(g}~8Q;x$!qL6a(J&$0*!y_YYHv=&T
zApw&fx}z5`odKu-R3wf>VRlNg){l?%6lDh$fkjB61hLDwya2=_jc{Y^sj>K2^Jnq>
z8k0;;SU9pig3(9uv)Bk9opoF`_5zh~u9PKgnCB6k`Znq^TL>9G_?Aig)cZ+8b~js-
z-ju%P)Qd0Xv}m!&<d6f4&w)j(DKWvaE_+2XAWcz)^~MRieaA8SV@}wP?tNq-DflRj
zBp~7-bKC{=mnq;A+2rfgPJE^e`}95&ZQt{H0YqG@Onob%4wNZ01gcy_;4GhrfTax6
zr{_gApG?wat(N9zB+%6eWNlwJEYx_E4vWV0W6|e+5!i>o3Xn5Vk#+u=(LL(X!SXp^
z>4XAZThe1&DDaO{=$HI|&Vf@PBqGXbd&YBz?2}WISHA7R=E3jylizstIs7M-c6E_i
zAOXxdykOm4`QvhEa{ijG*p2-tVeD;)LIU7~*BTofB9k+qlBLtoNb%@JV89JG>%}yg
zfXUNkI28w(2n_2|VP~dI3LxZw*uS1xR?mJc5lp=Bt0*RpJa!3rJ>;kSS?k9#M3w@J
zQH@6}F(tM(KPCdu8qkCr_2~2iki-CH?TgSm6TrtaWxTC3<7EilwqYFP5Fc*C+YWOx
z{>Q(y&z`t{gy?0&W~XFmXkE_Ui)sVflt&wK(gGRm<G`jnd@V!WLD2!}s#0&#UUmJX
zefJGd*!G3{tkR@kMFn=&?Wxeg-F|-_pb;Caf-JV`8tO06!McE{B9VA@>{F3UK+t$X
ziaPXgFp80fejJW5yeAc>+KNtIk%-cK0UTna;;8ON&$3cj76^oJ(T`!xnED;?17QD3
z^##Ci(srnGY3e1ZxgO0{I-cs{QG4#tDcjMX47DBX;w>~9r9i{y*vOKNUAx5|9okrZ
z*F&>!d%<u0&mTXB|BTYEE+PvgfH{X3tCcI>(^mdm4odz^DJ9iW6oo{}u}2@e(e#E%
z9EYdB6q&?GWDRr_MT9vr9W`fql#>Bb*v1-gWdIS=UEX}@Kp~Kb%er3*z!H2^1|$JM
zvd;rx$Pp_th!Kl}MvQ3OcLvA3)>xTEC0WmlgGE^0Z{o8^X5DkfRLlB7LbM~7i2Yc#
zT<+QEK!HwtO+Fx!;Q+ToE6z=K?alxFahp4}g(#$A^Cy{QPXR<uwa1-&#!Pd^zDBNy
z(S|3ObgC5Mc~&1xt#QQu*_9{lhp#zim#qLY3_MfQrRF>5<*=9O++T%hMK6j}j#bEa
z5b!Sh_}cfwYa#<LyvNUdrjNi*^Isf6WqkGVE1jkr1DrdG$X;Vvy9g@IhbY7)di?jR
zD2eST6{lG-J~F}Y0Q_R+b)kgJd?#Vd9NQJp2!K5GCZ?R%s*t={J!UT-e%M~W>8On~
zPP$Tb0Cq(>0-BBu(SS9KGV}82uuX2+W%nJO`w!zk^$Wjr4*w~oU0oy=NC0yVFVx^?
zPaSF3CE+1|fYPm^7|lKr-I#|W8a)TF;J~~tqxUhoi6G-0`&s&myUND0mccDWP|4~m
z0xo|Du(aJQicy*@KC<nEC?!NIJ{&9?c^m;~y?<K+CXaH~gAe;~U#L_{jB+CQtc_dt
z8~9kX&*;wUZngt+Ecoc~2#(E}PYz;oscJy%5yj+e1d1=>^BNLr<2cM+d^Y=d+kWrY
zm>#;s2<3&m&7RVM0z>qAsNUjLjCD*ne~;7eDxx70>QKCBIde?I&xn?|lmFUl4%-ji
ze8~1Lf6~e;bDlcN<HQT3vU>9~9{CicvaIMwGg)FJ;n6|_<B+NfQBH^?G%rRE(Vru5
zC}Qw}ydtR(T`2O<zH*9?vZ9xRRESi(sGo&*ieenZydV~iL;H8&=RZC^8CDq2S}!!8
zs<woH=kf`Ae(3@G?!8ag6?TvaPMkZ{ssb@W6(s>#l^^C<1)yRZ9U4g+n|InuzV;LL
zo&W0B&*49(w5yB20tsNw@kJsf|Jj~s#U$c;IP$K83kMNvCF{W?=bitd9^*op*+WrG
zj5Lb0es@@|trS_Tosh)|KF(8cda!)9j-nw48cmGUGBJYj;=e#4d>{N+h(gv<U;?FB
zg9$0L`0z+3q|4Sw0UntMNaWb${{l5T6#HX(Dt?yvNB8+Ld*lF+HCp<%boZPIK`OPA
zvl<{XQ6ohLeH$_s(M;Diu&<r;pB%bt!QT8|Poo63m;RVOAfw2u=Coz+VE27#ANm2r
zV!zJAwbo#F0Wd1cs~F#qMpuq>+1`WpZ=QG3ZZChrhFi$McwoS}ptjtdK@*L%S-vbG
zxqyyq#rhDc1|U#6YE`K!mT|e304QsKBoOd4TZmE=VW>ziHkwC1>kIY1pO7w-p#zx3
zNJjM*|6b8@c=kUBxu$g&X&@!~(!L7hWrte_?KKmR*^ga$&~Bp5RHF@?dadY4+Yun@
zM<oSoQi4rg3*RO}phV@JO*^c%b?5ikcm32KpW}a0X?GW}1yUc(IlefgBR)>;`Fgr2
zTy~_(BlKp31Mz>$$^wNfOK8kgF;_k!3SWjAfq&#qxY_E_OF>sQ>xGHFq1Pj@uJi%R
zYaOMnJSU~q^u7C#*zTf&Z}0$nK!m^Po_5jqdh+k(8bk3~+lm6m;ebOxu~tV>G!X8l
z1E_?P%HE%eMm#;`VB&6fZ5jDOKjLth;dOijPzY3A-B}kg)-XLHV`#Ch(wK-cy=R9e
zIKVbE-upjx+TL8x**E=*-L^P;0AHH7@r{zwSG^(DAl&9U7WMUqw36<+%1Swjl_g0Y
zq8bThmiqJd{GI!4`^L@ozdyEU?>c#zoi1<Tct)+iu);oZx`T*cDgsr?Nw&-|ETc{9
zBuG*91PB13_b_D_!fPYJ?mBB{rRmsYe@&y*@o5ZlG;1_&j9~O@5Rk=-qHV&i^Ma1s
zS{Fq2*0oS-PLUDH0o5``zhE!l@QD43y+`fp-abeY%OP4}8QC+C6Lg)>Z{r7XYPD*I
z7iscj(>>;N=ZOrv^wV^)OL?ID^{wy!w_CQCC%)>~ul#_9d8fapu>}&qoa2k%?v?%^
zUn+iw&rNq{LKH)kLd}}HWV#K61#$~=6jWA2og+pP-gFrO2jCE^Rqc0D&22Z!+^uV2
zhF&Q$QuCgCkvo6><2>E4@l4k&yEzR&a>k>(|H^>Fzsy`-`22JFFN1pvz?J>UpJi{?
zJ-f5?%_$`nHygiq9~ci|jaimOJ-&nBJ9APOjw=f!WwWswv*oEj`3*h`4sw8_L|9u%
zm)-LrLKhn6zv(;o+Tz>+J3)iurfn4EQAl=HnQv2d3P2=+ve$<qs5(fqT)`P^J))!q
zyQ1E(e|1O8Za>hnKYDD^9$4IEOZ`#n%ngu<8>8EvE1|K=Ym_=UXx$M2Sg!iag%m-<
zqYm}Ia8X1F!#=K&7WeO}%MngJHK@o^hVTNBPF5aXcO<-bAoCED7erBtW?jrvVx3}#
z#eD@m8U9bN3!<Cb`lsyc_DtF9#-FgA3kR)&TD0V4=<D>R(LO+_Q&$g;{(7ZRdvvO9
zM;0}7joT#clFxmLDA%RSsDkSDT`xR#`t)D%@r%y!Kd-dA3(Nw3Pn!PDodu5Fw3T*+
z<<;8ULPbrI4boBtLXQ&vis|2`n>zTKOgc(8zOJP1G9)~^WDMY<sD{^?VtDXS-|C%`
zjg?p3sd)h7^FZXWV<Fd3>rB*=0f^`MvtL1$FpZH-_IZeE;_Y}AQ>+6T2a|Qbq_vmp
z@3i$gdmoGaes>>{$ca?pa6=j``p`AdiR<T-;)?75ebnaQT0{(|X|Ozsx^xS<l{r#C
z3mgRc#C95ko<x}PwpVVmH+<)y%`G0ZTD@SKx6tj+-8TsykLDD?ptj%#;JqY%$F6<J
zxvHS9hod$7)VOBu@r#4DuQg(yK0aXgofx*e=LYSG`c7*OUxl2E6t0)qN9f<l3sh9T
z_o8|@Sv6MXDu7)HNtZbAC*X95K0|I&hKMKc^8J!Nqn0F92gMn>6Ujiv>olSt3RL{a
ziINb&^weVL^>I0w)`CqyL%wzNh<)9zllFq4{Wc**<`vdOWxBF~6rq@7EE<X44tg^E
zbY+kIn|E)pw@%$^tHYNgs5#{FIQ|P!{Z0`nKdN$zwBhvJ-?ab3zkKDn``4BBbfH-w
z0nB;0B9m8_-&R~{y;}V))LWe~<jveV7UVh#azasiOkh%QaKU3ZsIa^P3=fv=;!uxF
zQqQ4HV+$SMIUG(9#tqIgOL2wBCq^3%JP|M=P-IgsvlQA|ekw*X?&+L98`kyAo8Jsa
zCR6MrkMUCYGIN&MYI4x-_xyG{L^<yCLebsdE0e=R?6{02l|?&Yc{M8rr=KT;6DWv*
zJ5PS>iMhTlYBoLMjY|v#o8t~!Bgv*P!X%X6n_s%wzU7~9u+`=hUclcXnf=PL-+hb6
z71Bw8o#kNQQ{Qx)N@?l%#P7mI<7riY5X-Ek^cl*?joyeIT&UV#pO~<BEZ$(pb9?aP
zVrr;ZrAR~RFo{dbj%w)Cc6HGKIwBsIvJnt@YE0KDl;FesmSD30zFLN*$~v}?OZCne
zKqV(Sc3UZ{5FPbsN>!MfQ_fYU<2>ug%UqeK-m+)LUcPbCcG(j)z_8Uk5mt#Pr$<dj
z>DDq@YY5bM6Q?4sehWd*O407g@3wD$bAi#ymosx_hxLaxT5)iU^Qg0Z8cx-3q>;`1
z4oi?B`U^{c-2M+g_D|2lKeM#U3%~*?MRp!8Latl--FBh#YPl=duaeV{-;pPgZ-|L|
z0g2;)z$O;Fv&=~7aF^3i7Nlr1O^UW!7>Cl4lZ+`erB>t|1t8vwBG<1pS*)%29lP%&
z08Y=l%|p}^KqeMN0uuNmH2=kVPqd%kamU<Q7O41*_qu%A8L1NO%3)=#Jn%UMeLqfC
z&gwG}khF6TGNKb!Ux-pn((F=S%t+mA3Sr4M05AqyF9&)zxeaGj4C__xuJ_Q_frI~+
z@4MWVmJiwEk1yNay~DJyEZ~fl>ye`lHL(ztDUB(hlYiH>2<alPPq<^({Q%Tn`eRBU
z{B_iW>X-HGk-I-`vw7~|`-5zcR8kJ-s{%REWiKZs%m^;Jc+M(==B%_D=OO&@6k34Y
zGk~{dTW%^u7*JC#o%{~^o}MBSbIEOuB9L|%APA_|xasIN>07Dx?aJY{UDG>euh_Y0
z*AE^+5@(j*0nL<tT?^J=dwD4^v$+oAqD!g~0iaT6(43g*!i0VJ^bTw6x!Q`0gAf}Q
zOzer?A|3vKOusq^&_JYM41qANO?{`-t$ah}M}K9e`KBNFzVq-;E$#9=TOa|<dAI=G
z>d-sd+`?8GOW`(N=?9#NPag52Kq!}FBapR-lWXPc1r!oA^h6HN0tx`Afn#j3%|Zr5
zQWSRO=1{W<8J7^zxM$3Y$(|Mq(+xf7aA%VNOh~B(Pzlf3djVnwB-!1!69=Me)K=YB
zhP+G`>;$5L14Ot}kKQ|61wXFvS%_-FaRxAoWB^|*fMEv4s*_iU@Wd!v;!gXKg^sNP
z7#kVDwQ*??IWDXV2=}QE-1Wgzw#c3Ocm3dHIP?Sd=%dsjc2sSA6A4O$9m6ga#qc6V
zY?60byB_hl6oKeZfu+)RiXxQe+*p{iE32pM_(F@>Ed$iow%X}DG~UvP;(-tViW#Vl
z`%G+M1E8S&iGk)f39%tls;8b3lqE$v5J=(uHJe6~L!ugMfQ$~pX@#oCG@zNUFgR`1
zb`&P<WqWBtB|_OmI`G=@wv8<wu>n$K!(!68zFAO<mduQPU@GeS5Kokz8_OEZhDrg7
z&kLntd)txi_6Pg++UeTW*4sL4rE$m-^C&IP&mjPM8U^TC%MXko&N%>^4p0Rs+vO3f
zlxM%!zUvq6x8M9XzZtBS{+=!t&@VUrod*jXylU|MrPbvZ^;I{3Ya;;hw3kvp9FLEJ
zksp;G`v2K`55U>3tGstl|Mz-(Z|ah5NiK2&g8`cw2=Ia-6jRKj1PCD{kN__UkM~{(
z1mc(uF(sxLV`GeQw{Zb?%T|}JbbI~xGQGXux7OY>bFXA$gW8p5u15bUXHMDYob~Os
z*0;VDbHFawuSsRf!SJ_K2rG15Ja)294G5tPX17_90|Y`^S8_xIOadF3tLVX)antMg
zncge6<HdJ<kHr?yvIAga>xknffa$T!Y<uw=`y-@`0FyRo;NqY6L8ZTJzxE{pqlK6p
z>=Fi<mnGnpL6(QX-xgt_^PBL%H#^}d#&?AL!!DX|tyea~d2H`8xsFwqJ;8uq{eqeB
z13$J6gM2nzytonO=0?M=y~=5b&kYnPA)k&f?y7b`0V)B7CImV>b<(s<1;$?>y_TH_
zuY0f)KC(O>rpm3bCo@TIXd>LxoDJvj0M_z5X=aA2r86!KGuD|^Xo~G+$MAS^lGqnL
zSssZ(WNNwW1b{5;w`_I=UCi~6Cv;V7Pe2jg#sp%7+LYz66UF1JMpwhW;#$~FADf+Y
zz`SKwJM0;4li#aAK&!&M!=r=J-UOh=0OT%AGi}dVB{6k_8zss}ms+2*3!U)5`fT_>
z`Cxd%!lCd`?&i>*WZcGS3r(0VCr?fn@al5tEUy^_8+3x5HpQ;fHwTcE_cpq@-l;Ft
zpVNNr&wcU<xZQ@Ee1b+`2ry63z@NHd+rMnomi};**2k4K=}D1mQEr2qU0|bGxeS&J
zsDQx||2?o#lPMWhJ_l6l`H^t{sV3S{S!7c%bQlYHE&6>g`V~WF7TwpnV{$(cmNX0o
zW&_@edo9shumvLc*kAWm66hG)Dt`Ythg7));YtiWIdZuofknd1WATmIY+vEBNCfDu
zH(s?Go8)m>%?8WLzMaG|yp#MEm>WhU?LR#dHc&gbK=Qw=a3TkJiqO>78*<@QKX@=~
zn^_DC%d3IOdf2@O78C|q67@=myYg4W$>@E_(`?%S3fZD7g9kwtBrIho!|U#Eg?l!q
z!q)O;xQW8Yt$^GrGW_>!7Q!9n`EW8b8_FzK&P;?lfTUh94axo^t)Hh6Sr_1GTlKz-
zwWIS0)ZzUA$g^iw!nI>-C<;`<nfiEmusR>g<UO9XKNoJDTMUPbi|Dhh;K2Y`n+Rc2
z>oSfvQk*1o3Omg5ck1qGQbiC#=nk4K%Ci;7IG|^Yf6V5&2&>(#;ZN__5&qkqyTjRO
zz-4ScbVf<)kBo-QIQ?nxx@dt0NU}ZhUhNgajSI^mu-z;kV-U%J>(eF;OUn<8RkAl$
z{@`bY;={+&U<8ICvM1;y*D{&6cPG%Vl29QxvWm1t<28;HXkuc9+>@rd9&-^gbSmt#
z-tu}FNN70^aaD+Ka4U-^fT4&ir5Ntgd*u<5N8?mn2Mq@fH!cno{-&46w*<64^L&hZ
zD+Uw2-&)9wr_JmK9GK(<q^MTUaTx#&ZPyAG4PbmfOj_T5<ew#X!ge>V)KrP%8>3mX
z&?v@qJGqce=(Tlb;wJ%;&DCnyPUvPEbmk?|gPHL+$$6Z)uNMBF|8z8b%eU_hH{XJ%
zvvw{#c>hM&wrw=bZlPnOnu4h?ap0jbY{%Ln6@FTwWSXXN;H&i<`7i-n9=bFt0GyMl
z@<uo^56NIU8@{|#3#ZEG!-Yx~%5(vbY674c4T~(dsg9R`j@l`U7$#BA5=4T@d@bx9
z+X%O@j%QEPc@qE$NXV>Dm%>VJHmso@y^nQoFVW_>z3BlpF$QS~pm<kZp)LTnh}?ls
zD4YnQ_R1~#&WP5Yr*(kUAc-7cay78?IwRqp4M+=T@xDf{qWE;0&(+hJ2~3j!twN8s
z9sm(=6ss;XHc5jtUY|Y_I*V(xnP4yqfOTmcGQjv%mHNt?*}a!PLATv-vybx#q+8>0
z-t2I58iA9~ocv&Mv--mFCUqRENN^}FZVWHo-WGT<fYE1~ty}m?VXow9h(shrBJ^`?
z4Vy#rv90RcAqJ1aM=_*k9*cp5K6G#}Y)auV86CZj*4w4w&4-A{6Y<QJiFurXQ)oX+
z2o-_GFDzeAbVzzAVF~k|TOh!2Z>3+DnXt<^Ri2BA+e_iM9(Mp^Z!};#Amyo$d1=uj
zX0sWNQ;dE;DhFo)6-mI%OccWyv+AXCEvy3o3#@bj04e9o;YBZ-3NQc4Srh>-gesN!
zBN*o`+e%?<T=g=N+xJ|M!CuNW8Ib$&nbcQdGQ8tlCcJN@1lzX|j;ybQ8>qmaVs<~t
zWPZywLP{gr-(0{xhc&v9)&Z3Y47di(%c>aZO}^VyV>0N=I)`IyeVp9T6#Tf|Qz6Wa
zNGFZ`AtA5&r?ya53^jnRNTH`XdbZ6C5n+LCneqZ*Skw#@HqBFbn&TSC$)hk?Ji=u)
zh2#K5c{2q<XsX8n=svT&C;aNi=EB>St`3_}lCwDo3wR?Ldf-@n%?MyaL$&UR7;@xt
z6!Nn$Y(3;w$;%u)9oh@4j1H}&OMnkvPnZ5R*>d+so4@n3|6^zcp86v&^kAOAb6o7?
z{+K-P3kxVV3Or;m%<#q;r0B_(;J5gDWSYg<ys5f7<p_fTIANgHF{&-qzepoX@DvLh
z&AW(^oXA{tHkJ_zJfW`FJ@3Y5xrPrh6z1<^QMrXrQs~W6ECb6$TygIz0aP609?(P&
z#4<gJ4L9J?*fA&9Cm!#hC*-S(5N2Sd9gtC#_eUp4Eb|;fn036EnhFJ>0-vR@^*o&@
zM|nS9ERy5~IHbP?I4?5Z?|s`wxcg`&e8bnzhN}*a0M<+4%rO+3XG&puz7R%XVVR0D
z;UJ|ftY4ad+L**pm7zo*-OPvE7aMpW023jtNtP|)iH=d{l&Mg0gAnKV4st3*x<zv5
zjq+?`a9c9)p&UlXl~8Z1UfuvPFG@B@l-G8pFGgS~&-pE&Qqam$P^-CxIap$+0XS6Y
zW^iBCf>~a%#sjl&AT6=y8pWt}a(*Sg&l$yA+Yzv2LuDPnL(^`02BonqWQe6*;n#1U
z2_M?LCR9eYbH3R(txcdMl>+6e1Z*OXXuAqg#u_aZdUhvgLT*1`N1N!dLOqb^%OX#3
z;Mr2+N7(5hz&wGr=keO~5MUm!k$!~jRJ+-~plI%YmWY8Du@q%))V$GQ<E+?1X7Xy>
z`!GI{ycdTAGvF4MBA%{rin<oIFm^J|O#zbS2pkN|F;a1!&A2bkmpCoi2wpJ<GZ5x-
z_auhB+*2?Bh+(1xMD{k>{T>?lwtDVIZw%6YI0p6NUwV}Q#XK4%VEO~+i~0x7SJQrY
zU5tRLqp*<W7&3Gj#sdL>8wz0|tDggr&*B-#d)a^hFh+=MoLP5XlwQ1n0?GJ`Cqnp*
z-(CtYe91`o(&x|f{RU}+Mp#}Y%tZ)l0<ak&N20`b#+Y6Tk1=1*PlpExdA;rUCN(#+
zgq&JzyArmtoRpr&8TS#AW-XHb>kG{=y&YB>lm9N~+?rF8Xm8@pG;380@r`mlv=NHa
zao{kZEQ4W|EywGTERdDw06<Rq@H!a16yUU_sEyIj;ejcKq|ZBy1>L&2M)-v7bdc3)
z6CTSlCOKA`XFKE2x*s~#2%ou73P;DT34eNSSGZ&BaA<7Z7RI1fSC<IoLXB4V)@8ss
zH`LrHp>H8VZAdMbAtV^s;EjRKAvd>$yv!#14iP|V&v-#xJBOdZ+itk&$8iLbS$G^b
zIo#CX2pqq8^7u%tvcI}Q4wM;g0alW#v*n@4z}kHq6Z9rmqFJdbzy$_E4J)g_$dOk#
zwbBfe7~rue=w;zG%#V=+3_LuXWQCjwWKx)HFkBWrn20U5e)m`+@YrhyINN&icm`j`
zD=Ps|#6t&zggG(f)EkeOhq2WT0F57^KYY7-A1TpkU-hvfMWWwv;P?!tAicZ+6AD(@
zMVM1dfbSVV<|5mvN#+DyFI4vNcygiA9SNHl#YH@zC2B{u9BGDIZ_R~k4gy4~kx{=C
zm|aCk6|)niFh0$B!V_4-=-#)I4Ie(g8P1g^Eljkxx*l%aD2HpIOpACiQIxuZXvW!4
z1}usw%1mw{Y&6C}N{vB{;{Zr?FP4{Bgx#h6N@Fe`vm*eNe2Zj&1R#J?2~WnlKbZ1>
zOXVmmX6ut@LO4uuHpy(Zl`P@6Ir;?E$=OH-rbI}L|G){nEAl4~b@qinJhm;or}K>P
zP-zEYJL+0;gtqusgdyEr1Sq5~Mop2;b_%RnMWT3!L18nxd#Z3$?sx`3%>rh*vu8u&
z?z<e2bWu}~S;WH|-ONp_{jZ;)2xs_sDv!X>gL#6_E)9C$(9N}fyfB8rTj9pHm{Saw
z$*f{H6ce`-B~K<!@fALkyu8UzMIcfALX{b8Nru~)t<2o@GbQ)=T>>Z%i}~-omc(;1
zrk!hFxIs4%8vv5TJo6fu{<0~$m4L=`I6??#hNbQKN`2kQaq-1Gx8a92$Rcx3#~!{q
zXPN13a+wCn0+K3WIS~Xp#Ia<Ar)J1^XxTNu8-pi+7?XSqti2_Q6J_BFu%<=;k4X_L
z$bJ_})8WkdOnChVDq;HGdN>GudEd@yayXKyX5AYA`q>cHG3KX9>R;$ihU)m1FbW7A
zSYHm;)NA1|llyr*nF*@jM~tl@wlU(IY+uqg)kVT(>|1_9Cl5~uyDSf*+aQ#L$JGD`
z3wRkhz|t%H-IK*^TX?Pwd7+8-Q6Rh}PP>N3yh{xe@0)C66JRY(>URQMU`RDIvj2cr
zjimp^8nkNicA3%b;bLibcz647c*o@aa3ZrKv<eie(nqJuj%7iyIw862uJMpttC76t
z;1C#fL{q;Sc~Mf4W`98}v8pp{D@$0i33b_OpDZ85RSd|8**652C)n^jPMaSB%;PlB
zpKUYDB!4H9EBv@ZWHKlf<qTL@ad#Pg35^IOEG(s&s~Pw70e_8MDxE2eCtqrUi7Bv3
z4dn_{9WJA*P@9`}r`;MDSoqAn86|^NOQMH`<080NnkRumTt3mb;^MYI0w!mnCC?_J
zLt?BYu!!Jexs!M(4y-Ym?=Slgfkp%wjfs7wyi}j&79Ade6HBPBBoZ2OcqdBkXVmdg
zVvOW8sL}|t9N=kwbryi#K;5`WxT}Bx9KkEuCTcLTsng9J;i1im@QLFX;dA9MF;by+
z1i2MzJh~$=(FkWX#-~tmrl1nlfUBuHxu#hN*H%x5orK-gP9P^kE3r~}h#YSL16u+#
znvf6b)R;)fG&cj)*9s_EhVYmaPpzeb>^JXo(h-W@tJ)wd)G04i3eQcvkvtXmqN{h6
zU{k@Q90iV}0eN6^m3kNQ9?Hw;%8<9h;f5xmzGK;);iHSA;avAnI9b>qK3<y$%Ol%N
zZJ;A*?iNkdq@9@|JO=PuW^XGJG9%EDFObMrqdUpdSRWzfi#S#_Vbiu0;=8ySx@Qp^
zt+5h3os4ob>~n3i_BT)PZ8zNf<1_+8fO(t-`m=3j-zS&vS-Nfd&d}VvafI3C3JKo=
zH?7sZ#TmQR8S>UV2LX-bUo`RMwfe9a#TPUwvH&GDOfK~UC2%beW~CI-K6^~wf*x+n
zO;ao8STswPXCg3C6^7J>dmfD@;nD5Tw%r4YoG$IhfY*|MXV4oOthaGGta~a(r?$|Y
z+fX5ea}$Iz23<YyFkI1gJdu<tRBT;3R>l5p9a_J}$aK5NgD5#Mt{yLVh}6jB>V&#j
z?+78WSpew_3G@q$*Q$lWNOS{GTQRUzl!ebzf!`dPA!kuSN`;*y+y$o>1B{_nNRH%n
zhEUuH26*A*v2fSj$HSLB|1e!2PqGcVUs_FxNF165C``rpQje0y-E8yQ23~6p*4yZu
zj5^+cPQ13mv9#5DMNQ{4+0Zd4$0S;?&OO`AOBP7lm4FN!513jyF-~n$*i2xicG0Wy
zIADC0rYYoP-DmmiSoT18Z}wpL$kv15cxMhaTOl;nY9K@WPF}Il&D1UdWdJ60RPc!&
znUHcEuVxplfs31Xb5Qll5D?U&8?UNetyzw$MTLL+!eYp5R`JeUcs0WqK*SE2)%FiO
z!MELT^N-UA3<2hG8tBip8A4aTT+MfWPnFe@f3ohDHj7s*U6tc9#x5;yK*KX>U)qdb
z%W_j205CHLqi6-9%61v=SOIS#viSQ*(ee;j1PlE%*4Jp8M$3%=VvDn!yeV0B1GWfG
z_K5+E1C_1M9h2T){igSsb~ydUJeCw5^Ke*netOFoQNbSO0@#@76FcrmV6gZ4pCsum
z*~FscAru<)a2i$p<Rj)~9Ql##_qy6q<JIiMa!*soF^%O`_5T|2olO-WBI7ebRnb=J
zPnMW{SJ2qXv_|~As1jdNC>CO*W(mY?!m?Uly%;|D_e){dYB{{>>-L4o(m6s^Wk5i{
zCy*cnCzhX%88FKMGDUvhRA>|fJu;76O$l}z$#OHuLV4+eDVL6yprU>i*xDGxff9K^
z$Jng#J9$G?*J})b1q1FpK$#;ya?d*Xr|Lw=?L8EZcBjMNZ)^|uB6Yb$9Zf^j<LEM)
zvOt^}_!TEXB976(8!$Cbmt#<<ZQFP#OaPP%>nJg=P}C{ch`bB{-d$W$91brNJu;s9
zeFtEj8XJGl!f*XwC*nyR{&|Xzzz|@b1SfdCmH(qKHU4{rI=#+`VJ|}^m1&GM<QChu
z9Ekymi@obRDeDSU%;aja$xSRmKwZqBnCUR@#@r22o@0p30HN<Oq~yKPZVjzsm@5`|
z8l~Cow*9pxR6St1bO6okiOUV<WZZsfp9g^_e&!B|SMA%1^~TqmLS-(_;`^Y7i(baX
z3|!pvvdVkyU-YtMh^@I7=sHS1WS$)mkZODoU4~KC2qT26_Cr^mLqqEfAae?fzRG8{
ziwjKFb77vub)AHFOIl$STm^fu9woHZWWi(=@HW=g!VG-iPu;y4{-k{>e9yl)3<dZs
zog>#NI3#2Q09sOB()j>^_6x!>o+qBcMmJOmkLI^x%qa-fgh5O(5jr|*rL6){c`bMY
zh!&&bY*&DJ8d^pdMIBX$7A&(o-jt}>RbpXnCM_4oX2Tubec`W`c7(IXwuIK)wy-j~
zl`4Hgd&-Mw+?ANeLvh^Fy5)JvWscujivo=rVK(KAjN{HROfnv0b0qXZ!KF2{$=Id_
zfBP2b$#^(?)~ama;h<75K@KMre)rNVX87)@JOc4Hekvd0m2qs#Up)64)#bJCZ<0Vh
zdj=}0+A~Uh*XFy#?l}wzS!PTFMk>rc6Sw)WLME~Y{BM<I)^TCRSWdcA4ho*Hu)<B=
z`|l#i$V^xsW&lJ4L|V{3bBpiZPJEqyJK(Ko*`7BPKTiOYKq3`mI?38SlU^Q2jkomP
z%$9(~vM!fC;(pri3~l?zkhX6`1lHpHbXY(Ma19wx<t<tm$qI?84XWEWsba5SM3*5(
zVBcZlsc?{re#Z7eiY?EOuUSA5c<I3Yu(11ZSfAJuDq~aRT}YY3B+J6w%OZB)#-MNQ
z;PJfkZ^Hh2?hm^Zo@=ayo35V@-}8T5A12$!sdYhhm^@2?^U}qWv9OVyR24*Od=ZA^
zfLgpCoyXYYahWG#!kfS=`=Y*KqOlk7xTw#xyqm_$NDX2Qi<S*YSs^R(G|OBVM=8<#
z`_AF;?&>w+p3F{aTyTRJqx{r7AVj-uX=G~P8Gp;=*`R8|qQtzNh#0;U*eE=y?Fo^5
zQsV(v&m|#n2FnED&J9A4T)$c3clGr(yqQI0t}g(xgv*+(RR-`?;bU)j{lic2@RO(7
z2n+${NpePy-8#EBjI>S`7b;=#VmGWRnIc0Rji?!Uv2<hx>_d+aSu)-#{S|J9orHy5
z17I|pFPX>U>oCa~CGVpbW5z!kb5HaqfQY%7=pH!mI4ev`d4R_O&GygM(+7*RUhS6_
z>;Z_a$L*5!w~4>G1bhb0g-7xFu1tSD1Hd!@l(skNm@FToomAXkMx6Ii(@P(8u<R8I
zA1eYG*j9BlDZ@CPsWg#n0k|>FIl^MAgnLc^LMQMB&TW|qOWU@Gjkz5v$%kh=Wkdkw
zQN+-iI(IURzW=@9nMkAT26*b^A{LOtzyA8U@O|HWW0=mIAh~_obmHt>8O@hnl;l2O
zLj(*!0elGUNjsoK;y;gyKpTFr6E#?deMzFp<^>5TL{nF;%?6j&(IC&MD!IH7m~h42
z;T`2|;RBr=s1|Gmywn&B#Wb}!5RI6eOo!Z((@N4QF$A$3mU%EbyXhnu5NSJxE1oPf
z?@itn`6KH{K$sWiSmwcVp}2EjsBhVV04B$<uoPP7&V~#uyUybI^Kk557(V*Chn^(2
z-*7h`qY)Sa%wsgL|CrS-ylComot4d3REU?KI1f{YFb!sg`<V*;h%~a3g&2mCUGSP1
z-RNdtjOnuu3#o<?8C49PuiSItbp(S%zA~bfjPxMb$e0<JH~`6(cv{1acF!Y<-#ReG
z73g&$0K_ZHfx$r`;fjNkuRl6PZr^+{|1NIp@^MUcFv+V*>qr1(`<ehoo}0<*uoi`y
zqRftVQf;bfx)?T~@|FP^i8rqE`6rh)!fDOS`O)MUGkn~i6+%?!7}=Asv{sQksg@9M
z0faOd4Z@lQyo>pbTDXd359G$eJYlmIfOVl;WnD|*>ci9F2fp>$gy2rX<v3>fA@y=e
z-86HA-nvNFbU3-@$H|)l+zOCBinHw7C`t3wvRL(<YKPoJFosa5UN<@4s%yzo+j9&x
zfjjbB!bjTk;mqta!_krb;WS>Gl#!LAg3e_$X6tyW251wLe3W;3ylp$iDGf-(t&V5F
zzSj3XVd?ei(Q+cC5W>qc4sG_cxNsrV-~P7@b0Lh~@XU}ucztM0%)xkrEO7R8Xr4Qc
zr1i!B5&rIvfAsS6JAD7t8i8RB=1FuuH#WLITO299B9H3t_!Pp4<VhiYnzfa_RL@h8
zu{Hy(0=zhL_x!chBor}}8*0a{Nhn4`wG;NRW?KE_lM#>6F#Ap(j`<DdIrZlsK0|iu
zwAGqKOwkf1Old&n`{Q6V;Az;t$~HM<#X8VF!1T_6fA894o?^B=xCCs{dR<1xc60!k
zG)gK;bf9x78pk*m($%iRU<V2SPM#VP(V9DeL7BqHt#cFM)*Ud$7AxU2xs3Bv{bTx}
z6_>&`mXlW8a&sdrZ>~fC#hCyQBpNvjpj<URA9g_NouYPRtX(wEvdHXS(}QCVu7+Rw
z-H(Uw`}Ui|p=rQl<qY*M(0tYUn4@f!*&nFLyai+|4i?6EQ(s3z4Dc(?VZg@-eTkkd
zpfv8f&a)V4fTeOWfMA9g+Kl97mX>kUa^d{?(Xa{YZQHJ^!UiOTD#zVX;SYLfw&MU(
zr?H9BGBq#g*0d7&L*$S&W;S4Pj5V<_;E99`Eu}UCYUQl(ngAS6Mp#z7>gv!ub24mx
z>cb&-{CF6@`PPs-bWO<Zct+SfKLY@N;z@Mp4R__S8iBqAd91cDT<;e&0vDe<_4?+<
z#y3?6bDTacEiFPCs&BCFjZvi1LR~Rep_!|;U?PKVZ<{cOmZU3!At*2tmuRAFD|2KI
zBSnwHiZA(<LTm;o%H<@lMyrfhM*^E3Q%pc(8;i!@aUrhTKX$K3|BLI6KrxtyNeO!!
zpt!uzmIX2cSElo%v`9pf1a1Q$=VF~HmlQ*TF)V0@>o(Ay7B<sgvE`r<*A!AhY;+S)
z8JQU;MKBURbZ$KyBeWxFep1>lx1uA$z?a#_B}Q(M^==q@uNw}I&V-vLwujw_!%Wo?
zyR7D^@?Q>XnYD0^CSVVBX!5nS5dPJ7JTF``yBgZ-s0}yQO)6jIwPIh1Rh$86+Nyi8
zem;07SnD=`Q=DO%pvr&cZ?q?PV5;c142Wz#uT*c-BK)@i;=Jb+`fAlZ;jY@=aE4Np
z?7l-G96X3}^9aWiy%~`KbfV-1&)}6oKbDT2<zGyyTAe-n=q<GYQ{mQmU>Td_Ses@2
zhR+y31kIo=ONj#1tF4FDA{)O*Cs7bNH@7uR-SnJLIIuTVKXvc-Y`psCe(wvqIfe^g
zaU(DUm?zmOd+?_5tBR%e{aHdTXD;NzMGEC~Pp67}?QWtZo97{b5h!rCJNE~p#AlRy
zgsQ5+`9is5h00XP@1BpNfnE!IQn95eHftTxa1Vycdijk6DiL4eE;s<l(=Y&w9#9-x
z55vWk_Ao*q;+LOE*u?c&C5~pwm#xQ)dqmJgWlr0Q>7aq_3pD*W(|RmyW@3ySp@2xB
z%Ai;cj%q;a`EcQ4oqjS1hf-nRo}URHKeZlCB9I9O%s@ulkwMpQns_Ih7*T=D64mT#
zQ$0s`?3Vf6;ns;=VRVgiP_2ZO<|;bK^lqWb|3MP}qf>?O{oitHxOoqNu>JsseA5XD
zcpqKrajZ+J$**9nkz&ztsN1sacOC)H;vBUzMX6qLwz2D0OdrQtzjgQtv@4hn?|f)9
zy!MWKIN7EP<qEXuZF6Dl*|&s^DcT*&b91FkM_j^WQs1;~+X{6-#aV+EF|S1)i(Yfk
z+OEFWj!6Yv1(qtRgsPmMMF~L6IFd3M7<qXT{b6AtG%tuzR}Q7^+r!NK-k-SeTmSW!
zo@BS*aCaW%5f}o@qdcIG#_E<{Jo~5BwT=HnVVL7*Tc`rdxN1hJwObI#y##q55foAp
zq$Zx2C7XF~n1lzjUzRcUSOgif(+RYqflqlBc{Ogh9jsDv-|?ZJo9THs35=q*;(*~^
zj02!M4)!vJ$PCc>w#&bZX0oR>7AMN-0!{TWJ#WN*(+5Ofzi%V8CNPjm`6PSoUW-3f
zoQX-+>>Xp+rT(Q=D})ORRdQHJydZ2@nwk!Go?i<`%g~nDUor?5vNXWg%J3HIpu-B`
zs*BwQ-imTw`EbMZw(y+!{b8y`o`gcgMr{M7no2m)t%ak6q`Jjgc;z?T9KPg+u~4X-
zf-Vi6m+#aLxe9Gs`z$(gSDuXOWz4gLz#$c53PuZa0t2W3ViR({#4M9)UEM;J@)$Wb
zj%TeoN3*aHe)~?U_1i~C!9WyX$-==sVdA<YVRIT_RW62Hi}wd&ET~G&kb9EH<|FJ6
zg%u5X1S$%{>8<LH<OSCNE1Iiy9=JDTRw|*i?{LUKglNI1mO{7)09itUf%XO2rHd3`
zE|Bc6ge`M(zjxtnfBHR-#yvA!$CGmeh5+*<JTDJDYwAFDq<f5DKVD2GEUvj=p{X4U
zB+LVGV{fc7`^<$g;RP6bR1)$KL<zI2SG>Yw(y@}U_wbr|8}{A-VbB{$IT<%GeWUI9
z5vM%&lmHA&QfSHtv$xI&I{HoWBzmE=IPTGj@$Dp#=$o~^m*Zy6*AtgZ`)7X*Jba~n
z)?1$h;onErTUu6_Eg)4mO21>w8g#-ue-VF$K^&!VR(7fsPOcN;T5p9506nK7RAaS#
z!Aj~?WZ8kemqBucniI}MlO5R#GwlUb$zMfJ`u1`;R5xpQJGF2uQ-%V)Mqa!YzT$;P
z!Yf|7CrmX@P@K4kF=lFDg|_H@DPP8VvO0OzH5r;RCtA@c9gA3IooNDNcsUAlDlMa0
zA#nlJhM=hs8U%oQ$}{2r``BpsQ01AS)}%mmeS>QI)sUZ~bLTU!4Y>n*LWTAPROy&1
zgazI3W-{tpN%v3X*9_}5xv^Y~do1w1&5KbhiA)vP!l}}DwDD3p=T8y(8w;a5uL|v%
zEuqd%bz4>14dBhdgv+d#p)OwttqZ51G_QShBERKpHvj&O3s1sZakxwW#1R+*%s+7e
zAFH1)ykN_#E9Lbct}u0e2rp(6V<9G)jEM>b6F?YsSsqH9y^8KCBd(f;GDfe4Mbbxk
z6=CvM;W9J$(O6q9$7cO*yv-|$;h<C>lV-BLkXbbH?x}cy#=$Nc@CY)4;NjG2Z97t(
zBT5W<Md@O6bDu!TKXnlE3ici_MnFjnw^T6NtJl%EBritCIN;IPzQmp>){@%5h7V;V
znUSw9X|9h1jJiNlY`G!=6p}2bNv5A?=B^N=s?)VI8m|sUw+?4`nRjQ}9#*41K0Is7
z?r_7*?oeEBhw|nIo=_v4BW$E_*;02sJnx3B;d{U4ny`D6S`j*3S{W_q)>X@9#m3mD
z4(zZl&BVHe_#UIKyn2>4$i*4$ANpd^L-X_sAjbzy4F5UUWWRjZe0bCPb12T7B}cLv
zI_yW5%6oO5DeT%7rf<0^)SzLP8C%n%GercZ8KV^2C}bATZ%Xd_{6=HNqy@lep~Dsm
zQ-~~2ks<;_*3O688GvR5P@$7$Y1j4;CMNIz*;jev>V{b%G`PGFS`Y!k@nffP%f2Yw
z{?=oU)%`SF?-MfuLx6c=#{D1M>b=k0GEyAto-b^yO&>qi4(EyK*Cf%wKP#X#W9>#g
zy_V55W1xv9uj?3#8Z%+@XnM(hS7w&6HUP18BrnEitH~p9H2{Bj*h!v|O=WxUlXs#u
zns*d`;$UN$lem@$V7ATViNs|Lds2=?t50ATp(_EJp@NbCu8WrF&M91{rB!3&?*$sF
z8R`=odH|UaO2;L`Y{uEcOw#`n7z<=J*C-g2^a_c19Z40dzJP%|hbpsJW-I&#6KY3p
zpvHc;7=R5>?F?bIvj9Pzo!LJ=8*Z7~8@9EeFOyr_q`L7e5cN=FJzOBszk7Z(eD_yh
z6K*?%J#Sp3G1w~mgJdamG4m=UD5+jPbhB&&G@VFIH@Pqz4N;k!FvHX)S?2(JBi{}g
zfxa&ft#1jxc=zt`_U>(9CzbT}CFg0AdX!B-21y)wNn_U@2*sPO2^*6me9h5{BLWa)
zQ6|WZN_))}By2h%Z8b5)kc4TSp@0&mTYlT_P@kMMkoM~U&uVo&G*8gga+S~+bm-jH
z9U(J4ZysKTG00OZ)FD69I(<BJKlSOOVP^X?!^ht&z1HF5DKY{>fO!(1$aBx#`mgGp
z^8aWoG{VvIEhw@m<YG81pJI$KXJ}c`x;^{Jav1`mJyI9pHy#u7WWNK9Y;2ssdx%Z8
zJA#(yV>}#}LR$_xF*Igg&tPt)|GM{E``!U0X_skbmP;|9@_|q5?*oxO>;qkInYexP
zgd&(FU`hKvxNHjf*>@2@difX~NBqnZ?1F7Ye#{GWfR^Tb8^hTIDAb?D_K2i@9fMzE
zJQp$2$1&h%q54);ys7cxhx(_Qlqyy7r(u8{Cvm^QdMBVdUpKQY9D&X}LLqFKg3J{r
z3a8O3I|th;kI>~;K6f`EvmIe7f1bu-7~UFUnHYKPqC7XzqBFE0aCL%&uxw8qQKt!<
zs{adNHY24;xdkcycb(pj2XinyH1#ZW%8;6bOkj}6zGdq4&a7{Q?#42+k<E~wCTw=&
zRUvbDf2g4r&=j~y15I-KW24B<s7so{aDXr(maTJ$SppInwWQYg03wB*y8)4D16D0p
z1V}my7ojYlfqAeNvaBP&V<(`v&Gg&qVyvyKgzkg)0WzO_cewxWzwAkTOAdGK3mkzV
zz<hxN_INLT?6xhpXVx~JdFDbRoTV;8v*?-$5^ARH#ye4#)5$dZ$g_o^lR2S|S|tzG
zZ^S6eO3a!;bYRjttRO0aj4@XgUylG|kto^Mo+mK?HiPT&#V+xJQa;B(F#bOMbwC*K
zaMJGwmW!Kqud26>NRbst#GKF|2|!*+0y<Cni#_INPTS5Iy0|c=#3$YPaIZ<tx?H*i
zuS9?XJ4(OL8ErT&%g@0Vbj+wVK{xIsssQwgBjDs9uZ^j8v!S9__WT$|oos_i!E7Nn
zapUavu(vpevU4M>0wOD{>oknB)y`75{@`f%j+Y+}*Y4&7)Xs-&4WMb@DadQ(gqZiq
zj<x`rj^$_Mxk!kg{0-r}0!%Z3fB@#9@_6{I`?rNZK6_m#=XZorw7_T$fl(JxSt`yP
zFCI-7hFey8X&Z=ELNds0nGX5u50RhQ6<SE;s7^?=EuFD3Xw>AT^csMXcV*`i5G|m!
z*2ZILQ9G2K-4gP<cZU{0V+Eq*Ub2MivdaL{#nX5#Y=_2US;B|K?RYiF^JF$w!{!J6
zb|YWU?`pmKj}(C%KAr+2Fa($<`AK~6rmfElrR;~hE34t&({&h57&hfsHIbEZPtTa!
zkcl&Z(QI1sBK#G|G@`bb7-cFR^a_7B+NP!w$HBs2@qN#zCy&L#WYHr@-b?}*0|hty
zNlwPVD;o8l-kEb?djRMDNCYpt8vT;=ZTqr4V2b;ZydLjyqlFXwS|Kvt3mExpnrp%m
z?U!iD`eDp3xd)@~@dznEF_BD#obX0e8E?g!`rInpB|Pb2LZP`>z1}W?JO?uND|nX2
z*6V;xn_@;jgDjx&*Ev!KtU_JVDe6m(Gn-$OwiqioLa6AP$*t&=AxgQyc_r+0jw;Ag
zH4^ZpM);cN?GE4Yyd7ct=mm%eJVA^$Rz4@iWbKm@|M6*}n2cBh5UOZq01(vZMERZz
z<KZ=D_lJA8d=bT+0Kn1;J%gyxp@_3BPshTIfRS1|00dbSnJobieG%G1$j{T>7?8=&
zPFo&G+GT}p+o;MPL*AdWBf^2y0n`?OkYN_!%2ZL0zDPc)rUqr(LU#9F@<*d^0swTD
z$w|&1DgWjLfM#`#{btYS=flLI{b7P@WAh_-?p^+~|9LuIj>A7!_y`OE<_b80=bp3o
zSF@e9A9?WDMmV)5rWo;cW|d}slVLZn#I36lnPd1&rnkv!0gQ4nA_Pdj%0gukBt(HV
zW?Fn`Era0Y20ns^UPoq;huBhLKZdm|45nqGLAJed<DN(X32X*|$T$E!&qeE0u0~-%
z_i(g#wmk3CzV?&({ofi8#ORTMr#loq>T(bCBEaP3ndr509E5{93JC!S+Upicb_1+f
zplYvUJW^d7BZI)urhx>@IXse+u+5fHHJ12s2SYBZfIOlWbu$+MmJ=A{)4X3LM^VIR
zUo*Em93GhuS$?}tCrq_$zF03?k>(+qdwuJd9|+I8ifz@-phdQ6Ik&cw|5*1X`5{&R
zm(+Gy0|e_!+G_V+%!H4e9}VwqZVz`HdTF@0{aQSrBIlju7GSF_K@(;bS-?oSm7apN
z)Sd+r$lr8G!Uw>ji=K{ocovr>*Rpe4$XvBQWcKVv^?3qt8Zod{;zyw}E8N5j$}cX5
z_IdI%u<7!<S$^AA_Lm%#fCiw<ZnFMm1VAt1rCo$o#|TdD*oG=j@s$^U_ZMG#1>Bay
zUHm*oU<fdu#~?i!7CH6Yt@qHp>DrGU-2`B0Wkvj)=!_W$v7MBBrwx?tYEs0tWIvyY
z!iqhyP?;VUSrG$BKwwLFUZ>XttZzpL@;h#%ZMhy8IIFE!&}bl#fFv6A2wWbrau9M*
z^8At4@%VnEi!pgUgUjkm%iGxU5wNs90Z1<|)Axi-<kNCF5tQV4nKu)`FL^K=P+p0A
zD|S@w0wWB_l*m^osAWD#@_Z88jNf+nf!EN0Bo?DAkbB7yHp_?8OZBj@+6)lKESxEN
zvCa``ZAvb)B=`eiGrDO4pcT&B{K!POW@<;+k(~+^ax)b|YMZpQJYQYGAg_lPTsITG
z=H{*8>g_ZU%VD@%^j^UOsd63U0fZ5NqNZLkI;O@biT$nN=iagq-nxts=hZic+_i!?
zg6IGN002ouK~y&a2v-xb+78=HO3&n5)*7Kn|C%;nA{FGEYFqeyOPw`2X61fFE4KO-
zz@}TT;H?lY<WSWQrh}|bcK_~>J$Q8p6mYgAzb$0}bNC3~N|~U-y78KtFxM0YQ|={f
z-5FXmY_qPTg#S9^WpZk(44W)NjZ6--syA}~DSY7d|JRe@z8mhtKWzkt0P{~9zsK#@
z58bfq`MK%t-<^Nx0tSqnET(;pkc${URv&`_Q^B3vNtamH!;;thHIPxrCgxwvi-|zu
z$|>$C*w2)6h#@o!d70M|LrOgmaezo3!C)xNyJSYMBCRL7Au-C-3tgpcdh$N)M+72U
zhdh}mP2>0yb3+bniM?gpH&4ZXW6nlgrxhydcj<d0qPY^awRmPA)MD7E%7mcGoAlG+
z4H#qops9#r#1jo#z+&r*@x)H_1T@3XQOMFnDtVbW=5fdw3(NvgqOly9Rl&2~GQT6O
zPIZ3>nANZ#rXQQwIld+A8{ZiwDd21p5~~opS|mraR9g<y`FgnR;AD8oEz^N$QW&Gk
zsfh$l4sS+Nh^pE%6aFeU=EA!k%7kD4_*%HLK55P8R8?Qxb0`$AyESz89uBSXNxUok
z85L@-E!nfPx{9}g_rkL3JZUvGqDzxs>B45qHrN&o#4>nUxw@2-k<p>}rA0gH{GPob
zf7SIIz>d(MNE70+)H+o+>aD*4r|uGUFNAqBXrkp&SuSkZ5t=ADw**Qg`LhUQmXMWc
zudKm0*q~mg_UGZFfA!6e+YL6{&|@+JLx6cqhV`*o>EdmNerdhB{NqOvPuzs^x<Rqq
zia<ujR0(`Doc&Oj`ztb%-U3+?GVPd)v6>jpe#*r-z@%J8KbLX|z!(6;r-9ml!57rX
zOBndxqe+A7?Rzx|m=ZHB?RJmNCQrzcDG^ZOdQ%w9Db^8SA~1TMCGKCOD?6>aA1YHr
zHUm$Q7?j{=Aleda<ZoUKEs(h-_89dY>My1cn%BFiX_+6efXk$2;@u;_>59gy6*Hh&
zG`_kR`x;c|bM&r1w}dx?S489tpaHVXfYXnQ-=ATnr#UC*0hwjInjD7z@XX$@cWf74
z1D**6eX~;|Y_=9EJg4)uaMQl=@SMYA;n2=(n4iLPMW0Ndc7YZC&0Xd2C->LG$^0x0
zzP55*kS~F$R)r4TP<V9rL4+uu6}l8ywkK!NIYZ-2ss<$a<w-Q~I65`F6@jywo0+$!
zzBC#FsW$^Me4nFVPL{?xITeL+4Yr^%XZGRITn!6t=S~O+MfOQ?LS1=`3CEYKltTta
zSqm`B6ljUCeJ{^l)Z0Lfh8)mB(mw~tWL5yZ4GKapF26B+?63d%V{<nR*ZO#jzz|>_
zk5PUUw{h~;o$pv(Sp4$Eb1i1Vw815wO@P77II8ZhzKk)<EADUG^%%Qi^nR<y5Slz5
zK9}^rvBor`*K0HEeP9sic&$yJZrrCuyJuhqTzepl(Ql&%<DSugS7OFL0!I%pqURKU
z>y<v20$IxcBmlHSG&uBq^RQV0Jny$6Z^V1y_%Rvq@R{Y7*noLAI(~Xo>$?^q9fh%q
z(&XYd4QBI30Fb1;9k2t89j-jQGzh{Tmb3QBqUK89%;?05y^-vP0I<B=3a6o&ufrQx
zSK&G<5hc3K4yi@J1ts|T`B`@9yoJp&VRmdj?3>>UpiE=bsmTFoR-i7IQFE?U30V<r
zQ!~kVRF-FXp;j@cNoyQ$jfJK0ZFJzI?t~DUJQ5=u5O!-*1WEBO*8%ilI!=z<cyq`f
zx+>JirU|>!K#hGU;FWaOmP7pl4GO4HF(4%;g^OoxoaNal|054ZA+!vjk%IvzFQr57
zr8z>2<{dji{>W9KvvXJIj80h*s=S~MA+{W$zdYVhhnC768j)#&QQ8gI?b%1mf{D;w
zAOw7Y+|4pAV?+d4+5DOCiMRgzqj(DqSM@lKzz|>_$H9IyHg)g8opYH|?Gu%AD+g8@
z6#iir&eTkg*YYZAmMnc<OX<5cwAr$05*omGK0rp?>uKEUNNNBECg$_>b0a<D9gX-U
z0ApK-A*krBB=01J-8^R!y_ZN3NS=mm$1oyovmbi%WwbU2L8nc7ZY4T5y^vViW>29Y
zfr-|wgB7S)Xp7%Rx&9Q!8v$gb2wWG-Nwao4Cii@_y*!?b?O*g-Y#)qC;9)s?Z<I9w
z4WsO$Z=4X;MJ2p4<m=0XyUx|Z`dSxnhWt6}R)$Aj#RkvwOb8ZWgCXh+*mUV7L--1#
zKQ*&G?B8-QOy#CSm3Y@$b%R9yMku$-XpbpGs`NPLUHxRp<usuGHZyn+(pTeYJx!&(
zrt^<+Ck_U90#X1?7r@ERB7k}A4WWDF`p}|SvP~|@w9v@y<irR=_NBIPh7ehmu^}%-
zTWJl5LW^o*nNjb6I?_6@!}4^jEa2UAShgXM-8vTv`$_Qc+ZzJ9Zjz%>Uz)CRIjYA&
zr>(QRF1Z*}4j3VHmfv}h91UcE#dD$g&^;kMcyw`meA|xl+kRJ<`0#Osjld9KuCNoP
z-Mwe`mRsuC>bvV}m6;7<Jm&zI1;Qy3xpa8&<hJ#r7u-Z}@sOFDPCb)HAaF?<Wd=40
zG$LSJA|KQHE{4YhE-B9vzw3jE)-{m4H^7KqPJG>aAHmCk&E@v>(Dw;V(z1y?=HJEj
z>91^Qd@uVh`Fqv=m~l0oEfKANP|=P7x>5-b17IoQrKmqlg$jKwBW9Mf&pg1Vu9Fsi
zb6af+0p`(Y8DnATd*0}^0LEN5iB<BKe0D}aBgN)*w9L-b!s==#tP<Bm)!yo9T5Ml!
znsIW%Dn|gvE1c*3MP~j>+DKs{?3q6h_U<5|&I&gWmRyI<Ty9nYDnJH<-^>89Ku*6v
z1*WXO$##vJuCNx@hMrdSWMY;&7vxj`nGX4zHXf0H#uEMXgee|662`87R%q`%7;1ot
zgwK>iDFPzZv&Tc@><Q{wDmZ53Z$_!wr_(2g<?RMk-Os^f=?a=*nQpTT&;TB6t2;dv
zO8XCn!eInA=g}rpnXvLbctP}dHX=a>W?NXnb74%Tc3%}n4_ymLb;8E&pA3!D=YKx@
z)o=eyJl(@TSI`Iy0p<!iS%bTJ`_|bnFNezC<m=5`8Lg}bku_PwKuPW-g|$2+CVwVU
zmwE5{%05T;rIYs&?P3os`e89&MohH?Dv<$ZYjHpr1SYLoYroVR>gPxL-o`*EOzV;-
zV4&h*tppxP;zesU0ErB=6q3_&t}_<BDO-mhPf=kmzG%Z*GRb@8X5`iJeuQw2@=%(E
zk1~c0iY!$FBag+38cmW#-U{Qy%6uE{K^c+2?6!l8(Q^%H(*O$L_+A=f$hoM`O9szE
z@(R_9oU03T$E;AFgpE~RhHe32g$*2Mq99e~GOJMm<~eycta}}>85x-iyZ7E0wro9&
zkYzV);LWU+*YIdaxYOsQLA!t&vAZVwZ&G0#sTf~v-iZETn61RU1-i)F&}Jap01S|(
zsgd6iX&_4le~#ABV^>`RAl!ib%XafTvVck+uxKxx360}Nt*DY<SjZ#s-$4|#rGyoE
zAT=Lr6DsRMwa&7CIY<i1*|hNvJF_zo4Xz29BS#SGr0JOQQS5sc@&G-iLyo*l{sK7~
zkpW<xO|cK#Xj0ZWd*+Ys`;Y(Uw_l;R>eG6+hX8X0pZ<@{PJKr=TmOB{G8bXzeEb4c
z-wNqiB_J_x&TcO>#^;)y>%9km{NYTjn3sr#x#uYifP@3gKp3kRLW`hd>@?4H2*S+3
z#|CNU#U%TiK%|$TkJ~q>>U&lF3jYXHF@)x9wgHyeK*-IG#~lfGB6!M!(SAvtScz&q
ztdc(_|HlCt>0*s3na}>#Ax<c<pH&`9IhPm?Q_iJdoau}+O}E{fN#U{{vO0Cyo-9{l
z-8cjtkqG3?pa4)`?1c3*2w>vP(AUfe05y#CGBn*&P|#iHtQ#=T<Ymx50}|HxOE;A;
zN-KbU`)&%882=jWkk=5KT&Aw1OxUTxxo=pQ3z$|IOJ_q#{CKuaq>^PUh?*fMqH27u
zl~#L$JQR)4@QfN2JP+x2&&`F=!_Nql*WVJ#RNbppbO9h9g__;EcqUX%9Hoxsf_X26
zF;oI5!0Zwd6jL!naz6uj<nXqvo(ASw7tf~6Sa;_pL+RRULiW(15NJXsMr4b;Or}gO
zXBi>RjTJ;WmqV9||19e}c<nW(?)xwQe%}>*t3GXaeF!jD_^JQU?D#LVv(29xhmCW5
zVkUg_lXr$mRlZ>~e6A{ratxYT$jli*$%oP54+2c8i*YZ=b0D6B=~v79`^%H_DM#W6
z=Jha1{dwTGF&rnGoP;rBzTI}J=Ls1Qxmol&9S1i9s(8+N@-Put(y^pk9finLOezmX
z75OSU7l;@r@aT{am_P~zlSux`y%JTm=kYdj7-gq7yQh+~W@QmfmD8}DR<b_=jOA!#
zygg3Wf!U=2v`r}=g7eZY@fjYC(V6K~+Fk`f@MPAO`76)H_~u!_B~K?xYH4_@{7Q{|
zTIKUK^`7Av7A2yY9}iRW2g8ni&j{mlJ1w8HOxx!b#5AjQsR`h<5n@t-WE;M4EX!q*
zz`h7z%&XDWA>ajWluM!d(n4D;ax@jdrV7xY>MT{|(xIyfpWPDLyN^Ic7MK*3_a_Oz
z7#jCKgs3HiYjQdT<Y=-Iy@Z0S@Lv|1v&aA~@-kV-2_<S{1SBR`GcyxPS04#ED9-J9
z7;TJSOG*N&@L%5y1?p$I>x-c-kg9h-65jLYw_V}4>(g%chXC^wIOXq}pLlaMpZnTp
z{K$`m_ul<L`1I@kBFu}WWC~UK>kKl>FqpB8Q~5aIb_snq;tn3klSm1F?<X@DzDmjT
zRHq_uMTXrU?qMXb7|iMPx@sE0^e;aHJOhsueseH!@QM2sK_~qx{oa4;%P2OKjv?h?
ztlmY(FPVF#NHEmeLkp9ELiF&mq;^q>a*VA=QwA^s76FXHY8Y!1#x(rUXWFQPih+%A
zW&mKid5pFIM*3y^P)Y%8L`)3WQv62w869nzx|oYp_T$m0E>Zi`29VC<h1_0g(3%<T
zIgG#57|N_`1G-w7<DuLV$smdnk;$LI=<hsu1Kl!rqR=uDHa5CprCf!^UZY(D5;Hnl
zh4U1W5{M~eC=bWZl7~rcY~uTjnxC}@akXe7M(C_sqgYa+nCw%2&w((0-7`auFjsj7
zAVEf^$T|yzw<;$d2+flZa7@)u9GfJsLoXbY`v<tl=XA)qWT}tILP^d+an1l>O+sm%
z$vJ@LD#B-1hxS%Na(Gp()eRC-Ys3jRV5&hJK&vb)EZ!47_(lqDhL0;{1cm@}rJSkH
zv7hg_=DMkJdFA7!=REhS7ysnXgkSoxp9-rVd0&`Ohy;VFW@R!mQu$T*LdJH`gYocH
ztkUlRO!8XXU?&eEnWe!nS^^UPOl!UrU=rAv;g23jw8_373FzEr$9&IaH7;?Twyt>9
z8lVjP)+_y!$KxK5c~Nm0fvmR}?jOC5Yl>J{5ig;na9RWwV^SHwIB+B{Mkw0P!?3L=
zop1Xg7Fl%1Y#)sit@;>?N;wdTZ6=K~fJTK&0-r93QB%ll36*79+AE9+nU_R=DSRA{
z1pG08ksnf1zZ$@yBEVtwtDMhemTi=#!|dMc!q$B^gpp~&S*;SGGpNxR=Q25t8ls&}
zmhh~cw8$i_FA{(>9aN{HZ$!5?zZbw&lQ5U))N82Gm1V+btEdZbMUhV_9Xtdb`Rb5|
z6i}bq6552%##pX-^8QeH=t1bt5CTdgw2FZ;Eqye$q1MuYFqu)CQ3%LX0FW{@F91(>
zW;ztEyDp4edtK<v%u%b!6p3|omuM+KxU9Vf5#Z$M4}}lE=|xY0+xBU32t$B*ik#-R
zKL5*ZJ9_5SpM33q`=$LgWYd4`Ro@p%rynGVEssFpqTVDjuwqro^rkXQ2b+HVOJtEn
zZ$iM+50Uk~7&p<8@fATSflkk>vG3i`Yx%?|OU0Fg`Ea`s-4TfLF<y6*Aj?50dN;-l
z>%)Y9Hvnt{c^&~o>U(3dI;wFo5OeCTjOsKCjq$q324ne((U28lGjf>zisEcFA(sJ+
z_QhI+`sb-PfxR|BVVgaGalj!;Wl4YgE3Zf!AVk(!hBd{rwvI^`l^Gqc<0Q>LLpMyK
zXhvx^nBYJftXuC%%cIzg$HH-~u(8_MOqhV`yKUdILTPe8`50t{YJib}4F#7Z-P<JV
zjW~g^x2r(P(Ztd;f^Bk7&ZUDOAekFwYeZ+3(2c0Y2y%Ao>x8Yy)u{b5<C3F>rF890
zAxA>Kv2{zxkn}I$owQD!3^io<RjFTOKSlwi7HqYK+&?^;u2^L3PY#gDtpOrXo$G9)
zH8C2Bu+GM>xiPfnw-NGV@s)Mlw#CpnbBuzY<?jridea{~MQ+`v^kEDE<|%Y~|NN`J
z_e(oFcYfQ8f9i+6`|h{AJ^abP|B*05f|Y1LH?^p%*b-SgRjXTEK82wA0FyivD^ipP
z5RJZusSiTQL%B3R)AMTd+gRY}W~<LKbN|7^N)Mwq4**s2hRjPbm~jt4;OKU|pXj%J
zi`(ok>%TGk9pOjc$#`O~I2Qd=KB_}8V>S@%Q7#^UiJl98jV!<p7)T|6Q7BB?(r2kI
zCdL4hbd1UL*m4dq@(Qe7Kyu+CAeoCK8k-~-^d)soEec^Oi#&-C)zq4Su-#Gg(e5la
zVV98%kH;VafI?c53Tk1nndgTM{;m=>TW5py;$#@3W%KNwYs1(!LQlC_lmPgSINUlp
zjS9>z^*PRx=vV6pDQ`<(&Gy|+nY>;B47EY3##c^HL<tM20W>hsq-oZvQj|$|Ocirx
zp&%Eo!lSwBS_CeseIdt^C$H0h;#|4sp3uEO@g#A_;@mu5%@pUML$4yXPrj+ZQ@NRL
z4UdLvwgb_jGfI1a{d-}vvCi2Yp$>Dce&$pN_aA*<_{d*<$y4aoeaarn5MZ8yr}iVi
z_3kg-edC@#`kUYQrP)vY!D~k*F-vvTXGqXc67#&b*2)oJl7V)E-w$s^JJ|~;nbD6E
zb4xcQo0&Y11O)M~8TYh&TsGM++hzoS-gacr<31&OZ2K73k$!v0KE$xt;Cj>R9&=bf
zh|4Juq+Tr^5=;P-3uFsmN0CcVXiO3@5m+2xL_-!GSwIu_htE{FX|#!f&y1jE=;L)V
zrb3(mChb`Ktg9UqstI7U4xtfc{;e_qa}jpgsuKUSXFwF~j*V70M!l1|K`L;i_QL|3
zy1WoR)A?+%BMPC_d2aIUD&LiJQ=vPxC(P`BMws1oT?ivv!bTk(yDI9<cp23Sf|D&m
zSqdd8bZ0~e?TeUZvBpQb{1)D_V42W|W%!oD&1{#Xy}$<cS!;6%iZV>G@v)HGdkC<(
zAr!B^F64IYVBGM87B7VQ(R<0&JZQzP#aVy`LV!Tu+Yvx%fNiFNP07%R9H6{SoBhge
z+Y!cg?FD%8obI{%{oxaT`^8VeTlXn<NJD^m%AHa_pm+Z4?{6JxE|0(CFaGGGOZR<Z
zD;;&2txE&TQm&TtA2iCz;P*fyey`6H+e@}3+Q#Tn^g%|yb8xWfuNha<Ia886dN2l1
z9!=`8!3MpU9w4NBiy)@o7|_U8N3cozs}Gasr2R|z91B^+Z6z;AzY&X0`3$Skkul}<
zECy_bZ^k0Ppn7kfO9B}?1_6vfz&w+<{}DLMJJDzMKJIk%ZnSge!Sqnn0VitN5GDg4
z)CEqpH_GQ|MR+bvieM^d0U5$&6;`~AcGvC8RTyLnizzb#rJB_!WG1GWlR0$2v<ToF
z2#D}6PziuQ!1wF03ObWJ!}Ok;!o;o{LY~)a<Y~$f3F>$=71}2^sfj7*Snc>EqN*Ai
zA9o`L;K_(1U?^l>jaqjDI@aTcyb9!iY-2UlHx^izNCAxF+;&)Ecr-WM5Hd)@jLx9K
zjC)pj@ZL~`eU@1(hy2V`C{T>498Lq)SsU4&F5r^GtH}a{MX3m&c~)i{duSDqtA=Mk
z_=ey8#8dRveM%kH5MZ8Cr}N<l^V*mH%LDI!@0&h4QQzDu;*#jB%Eic1=;SB*YWig|
z);$my0ECp7?=2H^9Uczz5Sp{gQh3dY1HOEsJr4kdz8mm<(mD(fVhGIuWWY<YV$TFX
z5p@OuD0w&0CiXza*mHKg9wsyGHMg5#7Brb<My`&-sK!`TzlX;}Rd#X9p7$az#+YLX
zXgn{I049aU(lUPV=^!KOC4kXB>3{$ktNIs!iD4!h8!kzkg3K~er80`l6@cRenq+s-
zH%2luB5X8Nx1;lEAmhyitib@^DUj6x5h;1JdYUSqt#R-x6y!8$+dKjDY;w;np*XRJ
z@Y*OEX)Hrm%E~(RFg5Za8J>17k%eIwB@1IXZzKW6L!<!l-7QZeFDiPzgwk?|b5bqI
zdX}i4Bo{;BB}F?SORlDP-A%L*fHAgfJDv(mwTF&{6~bqYr6mXfV_|e|3$33i8dW6@
zo)xSD)SJaV;}g~uP%qdvx9v@*{^n1=;VFLWJ|zxs2ry5H)A+d$;^+21>lsH+-2RcR
z1W056Yf5m-Okf~fmuxWQA45VBG(6uSV;Vst+RHQ{*OpUH7o`^5uzRi~0Z{@O2av&<
zn9Bgh@+b~SDQ_door*ziTRp%VNc6kY)O+*+sQ-nVKTqmw4AhO%?R+=P0T^32UU@L8
zcd<l2JAy2k0gR4E;V=(lMX(w0O1##^osvY8a{zKyRzk#-hw+?Da)oq20vU;98mgI!
zu}RU8G^;_J+CW2wNqkr<j)YGWDmyRjFIKA%nX2p)z+l$J6|fMX+5<QsD7Qij0eL)p
zpu){H&O4coP#>KSqdTq(qr0CGGV_Q{!bGd0B+x{@Ws|OxO5}H2coEtib7OrJRYN0)
z?vn2j(9k@__Sc&eDGV#nA-t6(nAa9SY=p-8MJUOH<7hgT+jEd={%c^1T^+U@JYcD)
z`sp)a`EI-#{^nG;iAPhQaI*%r8=r<^Sk4DPql*6)g_y0y<@blo)V~b(zKx{J@bP3F
zfg!*=S<lR)yk9>uvFl~)<@0Zw#6-CGCI8o~RI}3<GE2NNvxzdon&tPxQ~lgbUn<|t
zf8T?N@1tGs5eDM1q%N7pKI<j<6Fqt`_mh}s(Ni!`=@SdwyK+WZx`6F(-0r0pbst#L
z0y<80|4bsBa~$k;o)cn@alo?g1zryVjD@cv)!0BsfTHn>-i?7)1R8yjO^N*YndN2-
zA>9r;QNUg}z-Tv}ks!qttCPtRGV0JwrgEwoYS745@jA~_7J4u3k87MbE0g5+Dymd7
zIfcrk)GU@<jn4%<lA)2hgVo-u9kk_WV4Lwdx(^Tna-p1`3c0PcEZ%)>nA}O8W^@OQ
z)1ZbxW!qd^44a5ib^w|jZIz4CSW}vx6}p#1wZ8mdVrfR`a8xsTd`E?w{6<iy%u%<#
z657z5n~)4TG&u`%)1h?uKq%aFOPGT4JT^N^p(Skz?!Par+<P~w0wn(t$;?cR+Oc*4
z8Ld;4@~{C>;r#i9u+jdxaO~Z`{wUur!__@XBQOM*M`<9hgw_0DY1<EM))#+mGKM*1
zvIQ`j2^;7b;5ecf2$($_paZ*o?1iXg52MYD%f|0yGxhsuH~T>2>rCOVe$FN3Y24Vl
z#}w^u{F}Cuz{xJ$o;TxroZb(f3mr`oN{LI^vF4)}r|r%0+npHBNj6-PL<~ELqKg4c
zs+|--<P^RVuvt#VoCAA*sRyG$asbkSBnFv008ym1o8~_np!CPuLU=J=Xho5FqG4uC
zw>1=(=}xw`iRXd3<WZWLEl?vOrDTPsOtFDgDL;Y@Hgj&FP}V7rMqM+Nce4;0J0>Aa
zc{BQ}@5=1V8qYFyH(_R1m>^Fxx%*imgY16`{j@6EDpNdJrCy~$?TVC;1+uy_I!a`*
zak2VzG_HT=;AvEHqe3f3BZ{*+V(PD2GOvWjCP{zVKzGp{EAA$IcFpx+;s&H?_UthQ
zf%W_ECw%q*1OgRqj+#zd8&64XW{@XE4S}#3o>Bee#W#cp-}6mZ!aewO+s7fmJk8Gd
zk4^6S?;9Iue{3{*FDk-R9!4`^&2-JPk)?=+y)OxqXvN99aIm@j+4Cr5ND}~<$Kw5F
z%=^$BIIcgYFYI#%uNYdh?|ny-Buw(G5_tJOI-$Uf+GSy~bZTw(o0g5QtwpdQ5=nku
z?IK?>g|QsvX)UGIfJ_*Wyb%K!2R`%QBEWd0i}eVM1_r|_{N=3}F+f1W4<jOHteW06
z!V>5iUTS%*M|%eNw6XW5M597VbYs@&0knod=0&v3j-gtNl2`;7&!lM#U2o0Lg4I6p
zS|Fpp0u$wLSOtGM-*_?#r`1?aIT|t^h-$X+YLMlpyXM%g8^bL1H2LXWp;ej-6+(7R
zjD2MtYBHpN4$QAS2Po+sE8mpYVq3;L)R@@$cIKk4t5(*VcV_^PEP$iE>d*|WU0xva
zg_k6`nyHyE0x4kZ)?35eHCLNX-143GgtZ4ABoaqU017ri8{396+1TdBdZ<10!262h
z^Iy^U%p0}h;p0g%0z-g#lAMUgb2ol?eEVP3$``)I8#wAlRXLr)EG{FHG|XhDBmrZf
zAe*SaW=s2ZEy<?ZtUi7>Xf$o29t~dK{<>i`Lmk(32^d8a>0lJMAD2sgW9-5m2%VWX
zX3Gy6Z3n9X+;LI^KQ35u#J`hAw+~A`6RFB{Pqq-5l$Wgort%9yn<(Qiqi_3fMV5Vl
zu`%kg8X^eA4rjKx9-4aIORMdHVnkd~oSk*X_k1RXS%wppsTV@!bUUmppj}1`?dT>9
zF~U@0#3@*1qhtrF_wtZD`L6S7G+@3FwOLPr%?9V3&fB4kERFIwTBgY{I|Z7$8m@y4
z*lhLUG!e6F!^Ga3?Qe(oZ9t_)N6RXPy^3B~4Y?Q17;@5R)1<(nuNsU<A>pMMX$sQ`
z6r|{E3yFP(q5#l|3Ui%y2FszjN!#Qm75-`;aD<TEGoKq~uek{?2HBkZ?*?R*qruap
zW2TZfT_y&09aUGu`svdj4xf6<^B>Q9X}GP=Jpx03d754!-<_R!PoY!!l9GkXRHv=b
zii8h6@o#J-^9%f89)?ZHwX_!@J`w}WS!Dj58Dae}dNRoyNu4IOoWAn{&Vd|H+@5dO
z0j3wKi=h`=_rP+!)9B8@CHvh!#a`E;hk!FJHHsnJM*-j*M?Qw<pk%+*g#ZXrV}?iN
z;E(_&dLq^X#yl7IU<_akNFo}Gs%*I%|31~k7=UQ&DW+rxnh;!JCx=UeUjRlznG&h&
ztmebY*?L&P)i_Bz<^_smS{CXfSQd}R#y&x9OuXwW^Bn*Hiu!xuv2_+ck0i`RjzgIg
z#n>9WZs>PBl+0J#fOQP|dXC06)6~=Kc}AFq<~%a94>t7(;j<RnXXUT~`%E<|qAKT6
zTXuy2WN7RH&~@cx1H(bQMC(+<_M~#{UJZdgwGU|H$yAW+w*pQM6seh!efz?eTW$+;
z*WE&4CXFudzay;E?NVJ^B}JpcB~v13Q$%|1)ceEVz5a{)=Y06;i5!6;z&w$o{{(OQ
zRhjX>Eq2RaB+Dt2(6WkdhXDah%?#9_d3HPKo=5}*x1EVR)guKY@1*C2r2K=Hk5`?H
zqJJMlXAzhjQ2OBFvvPk7{_R7Ae`l*TJMHVWw>q8(Dw00&uvi34jGo|C0J4vS`ZktR
zqJNaX3X`esMg3&-T2y88Rs<;KwYUd!Il#oh>4ne~8dJkEZ>h|4ASNiUuY-|PRML*u
z5ZA+b9`uX37`d9-RGWOrMVia;JV&wQ`3)2r4AH2zw{W7`2qen51HG+-|I3WGSk&?`
zEUFy4ep}<Xk5SBcg7*~&1j^MYbY~-j(UwX;le~!}GdD@(Z?H_Yv^9+FpjXSmo5I+3
z!dS(rP*wxADx?6snmRP+hFT~CUK(3Daqhu5i^0^K4Prz(6RR)Q)DSw$;L+qn4(Ljf
zW|^cvV6zG6updP_V{W?zkA|jb>lHNGjvWnk*lv<{%aJR}sSz6ao$`hA?+ovM-B-j*
zVEE?=9DyOgJb{D%WZ3lgg|T-K82NG~wiRy4DAR8qjMztR1Z2BpzB~lw0AWiFfQ#On
z`H0>_0*Q2tZa@<_`KQU#NI;SDFZ#{_=2C(|Z@IXh6kbZ6P!Cvi9G3xOdaHd@3Pi_M
zv;>K`P~ET{<F}j7XPJGg9!ABM%Bd*PFZ(Xpe;u4YOA`H_bJ1rChs6VrKw`F(p9o;g
zgHh5xf>sZa6QC$T9vk%}&(8<i-6G{<x<x4iV5Bz+Vf9QST&U#2X=G(;R?h^$XgLcx
z@;aW2K@U&c*1j00%fg_Z=(qQ1p523r^PPAR8&U-T@sy*I*CXn*@*JL0k$01$fD^Aq
z`5b=No!k@V_TCWY4m=}dXAXqTW-)A*QH~&_RYzN_Laj?j1)S8z6fN%rWtvvdQjAt3
zH>hDI*JCK8y@}G_)@lG!b-uevq2*?1BimnDzrZ-qPGRTHF#n9_hk4ouY;2ap#gh*r
zErYy2GDul@Ni<Gtt(JeG_Rin>4^M`B?CHD@Lx6cYpZ$33-w{f0r3KbkYPN3ocTC31
zyelu`JfKSx$i3PW+0F<!Rvn)}C%Qfk3VxX-&`91%&j1bpf;^*3J(m7&Z5>+H?0THl
z3xo_k(w0&-=2A~4oo3lFW6cRX%s2}ey5wK7;V?1%EzH)(G1MisNmk=Az7wO&juFeP
zuvzL6W4=MuC^HWuS@wj4F|T4_vcxD$z+->)BR@U^7&ZUWv1qVudu*5;6~J`pB_nan
zJh>QoF&CjNpI*hIz$I}<N~5W~4IYExSsEW(yPpAt>1_30`Cn6LOeqn@p*b=Z?!e>t
zB#CrdewpgDJSq=%$y<#+t7=?WPeZMm0cXrQA+&60jLuk&X8Ymig|RJHlgpVzj;0wl
zh-uZS&L`d#n$VrQDx6e@P&}tDgDGDr%ic;WPk3+iakUOm=gkb%XbE!y82pgQ5A<eo
zG<A7308?ShuCVLY=ZC37SA}!rZ_10SoLm5i@vf~b+}Zr=U%zR1dq1%wFa($<cKn|-
zTmSAb@n`UezF9MNtCx#Kn5mD9ax*bC8;yD2aHSBL3hYt@DCGez^+epDJ0T$DRt5~9
z64ud8N*;)Z#3H@fY-Id>3jXw%X%0<pOjF3s6_ed!yKzOB-V*()c2_4*hcognygH1q
z<ZcYt1Rsv7!I*L|#K%lwUq&3uZH|mF%N!pPz@&PZGyuIi7kMAri5OmsDC>wNs&Aqy
zt9UVjOiW<7$Ea~)5L#6GXV5Ln;bE$W%o54|(@UAKRF>qw1Fn0C3i-L*i;bN2H%<Ty
zONCi{uKq8Uli`B~Xrg>ZW0bC_AkHVz16$)e_0G}Q8z5W3rnRf{E(Eq;{@W%SlP6Wi
z>#5`!$=Q8SooVg7|Cs@H0HlOb0~^tuyMRq|eF?p_WsZ$H7phwGIXoJ`#?=Dc6YStg
zc6oYk#sg|?_{jyR&c((W!kAE;X&+F9$yP*yW+!BT0$n#xFRWUCx4CrYMDwqH<G_>V
z278+B$Pi$jrYHW94*UDE)4$N}Z2XjT%XG7w5{UlsS2JFtG%IYzdkdW<<1Yhdo=d+<
z-@{c72B{FU9|Cj3?(8zp<v7*AXU);Z#9Ej>(ZIn*+i_UX{}YYcKaWN={gn-m$6zbd
zHtoF{Zh7s7{f4h32eua@cMxklg&~{NY+4>eEWEThe+-*>1F{G(ieg2#%|d1oVJ`<5
zt8LLZrC6a=?K}7wHQ8cVjH^IK%_S8*vaxgDtfS>Ye`A-iOqu>Mrxvo|+$!FHVU>?c
zo*#-KXEfoMkMVO8Yj^Ct+922t)ddo~Y?-zg;+m6F;e#;5K0*JOO?fap>#C#C{tHxu
zl|0o&#?LpOac~CO{A;nz4VGIEBcWZK52fwbge})RKNNRf8&<N@VMW9Q(VOu~nzUxF
zK@4aU-tzI4%BX-wqyQzR#Bg-bgyLYWCgj;<N#>VCv~JTQ26ceu#*(!T*nZQq37tJB
zEU#P$n<tOIcH`}T@XF!s{lt#I5MZ9z@qf~7{d>dMHxb!?{aDPu$Qa2Msi;q&<3gDV
zVf6Dd9*vP#l8k;d^l|2FHZ>Z`<Y&ZGMFbf4Fw7%x)QZsKuP+CdL1x-up25M!SJGck
z1SrQc47a~9Nb7~$JT$WfBX$HMzJ(>wAchJnU9wow&(9QkabR)bOV7PTfU);+XSE%H
zgIWH#oya`%{EL1hF!Gkh20G@huwS6JN<@s!YuSG3e(5_$yiNg_Cn?@q&xW%Lxp49#
z&@Z55CE7#}U<9LV+q@UE)e-39fc4sDO9g=6mQ|<1dU9-I0Zp*?Af10NAv4R#unuXO
z8A)P*0;e(boEt<4fv!M-hl)Mr9aW^}j6YS$j^kbI4dVxH4&#TOP0yO`VF^LJs#pgC
zENITnwIwLZ)YhOFFhWeSK#s;#3KYh(^W^z2QR<E2ji;%019{pF<mpk9LA0%0UJKJZ
zwucvf=@&1r9ADh=f#3i6VNJ{vY+;4~^8}xOC*$Tx{9jWJmG@6)a@%AybbHIlNOnY)
zJ9#i(8Lk<z8}{T)m;o?b8x2DQ4*RSJMhQqX1CAlBI9t6OWPDw2$dXs&?o<3-9K>Ex
zsDIh_bS(MZH{zNWTTW>ppQ9xtNOBc3=tC?&jdE|1X*IQu)+0utu#)9c1PI<f+2mvN
zUdN<m62QnKNfa&_MbVBU!1=&LfRX1TRJe@OZQ$anl?I5$8Wz~FKOI7423oMlmI#p@
zJKMrgOZhnizsnh07Q#|4MqWW=%4w7NI4<Tf@tvJcb6k`IAx(l8Lkfce&r&!|d*pYM
zKu7P`6ageZqY$ZTWdwX0dF__x#5CIw^G^8__omD<V;h>3G&rCtKdzt@P@mWyN_(Ff
z#tz>WI&+87MH@x`OzS|<5*C^AG>ug%bgFnYVl|JzaD;KuW{2%cv`LH(OoOh@oa$%O
zqU!-|m?0E2@Mh503G?IA|MSe>zu{j!8Sk&B(>@IW=IL~{zkp-?{xCIK3G07BME&K6
zbO0ohFZ8gDDd++`b&QC#uLLl9xKzIzf_}6pfj|m@IaqkzhCeJP(*uBHM4}fH!D-Or
zNwzu#>1-WtcHFy&-jaVBK`nYl9`uVIjBh>uU;EGrdq9q7VvV<`>YOC|1RE}4$nt8d
zEMsb5quwd}E-}awz-Uzl7!EMzAqimgeGerKVST3}2_Usu_VAjABn2=T%IvcpwZv3T
z3=19WX>6S^rqq<|_*rPnl#3`YG{S-f<-u5o$Ox3rkY~aZ(0a*bsP&9C>A>LULA5UO
z6qIw*B%+FIs!2mADIe1b??c6SMIgb_3YAq2?bJoh$sh5=^G9BkJRA?B`TBiKwf?rK
z$qKcxA9Q*Nl@k8qmTT~8o`u-vHK8&x7na)sl6)B9STg`kdHDkJG^>_VDI%?qXP>Nv
z08fOa0Um`6l@}5f0f1pI3To#}$IaIIxzIR&;RCtC{7<(}zWsx}i*5MgaUFpnz&x(Q
zedTTJRU!Y1dT9OjR0!j;l4{f2lJo}#g&v2!#J}ZiWXIe_#ym<t@ow?@2o^p2+B5pm
zPDP-w5SNUgg^e7z+=KBC9*Mr3pu&JAf{d0<A+%)QwOoI*J`0cWDf`_dUp^Bs*#~dC
z*6M`$QFXg)g%Nf}VIUcK8E*lMCH56I(>}^eNvL`mz+7_R$+O`%DJK(gxep-PBk-*Y
zz<9XKLuZ}<@sOHARgFqE9KX;G7XTWC<BEX3CHRF8{BJ~tp4TlEi1N3{I#g{h?v}h5
z4>t8D2m%5fGlI%iytpzt9^SFB5$+?~Z=j$_hC*trK{+f{<@YL-bUbk^Q_jcFbNW<y
zAq65mTjSC-xlpB8vpuyp6!%^i#*REIH0Sn)^;Q8-%Ka^c18Xabp}ue)a>H84iF7eG
zj+X_Tijr&|q69+SixMS4`I;PhY1ukW&{ofe+L?vF2*s(N4yWIF-<5X<UfFv*1eh!P
z{5>vv|Ee%PL1&UbCS3G2)N}WmT&b2q4-1{hY{a~cK66^LLxLVXda;Z~G<;_KFS`QM
z+my30KuMGS9%C%7Q-9o%h<lQ@<)D;+Xb{-6{<KdrsDp5$&t<2MV1%z39SNJTsrHQ*
z!xlm}W6~%SaEQ8WMqZwb!ela>IzWZNJe1~)vd`kd^b$yMYz78HKx26v8Eb)xRYXQ_
z#&SmjB_BNPdZ&>OXNU-$SOkq?h)fN`q*9;)N3hYnE)-wvd=&Yih0r3nbz@<zCGTVY
zM`KbY9%emdMoQs6(CnRr%FeS>>MKL7y+2JQS?At8R#|xywM?F51Ue66YHT9J%UiXa
zksTzzAQyv|MITI9Y!e~ux;BhJbq<gN%7rPD!|CQm@V@v<_^W*3L})BuwBsm^;9Ikg
z3h6cTMfaRFKFS*cwt2kJ3<aC@>SAawUR()_mDd&~CjM>n=v#f~hmXg61cm_fm=Ez&
zX9M3KMxR>?)i(h&JJlacp$ZGJU_z7x)_m5Tmq-MV5!4kHjbnlv3!g_&NFGMS3P%ck
z7eOVKwfDMerr-6wm>3#!yX-(S5GEV+Tr3P{M%rwUnLc|TUs?S+zoq3>I858$GIH@Z
zjh4bb0A>WrEa?D3Wo%Em0Sk}0=U`rzy><`7KraGJa-m{mF|x?COacrCjeZC%eaH8b
zr^ndC=0)_;$Hl-U&f2>q$uF+u!b3DB6PT--zsQOTAfIOOd5L$j8YvOz=N3XzvR@6=
zER4nar6izWmW{gw&q5vZTT`Z<<*K7&;oX&b_$c&Z1)<FI;7lt;Wa6j_P${RSby;q!
zcNT1HRpsKjv2*|$EcY@+6PL%Oty+$czw(eaGb5oje>fBlJ}Zpxzcy4Sc82B77$LC|
z^)sj_tJU+#qoJ{Q&Oj>5WFSawxhS<|0TS?P%A`tWd5$LELS=1zAvDgNJ^^*^KZn!r
z`Mr2K4F5caBQOM*$8d0;s;mCaQ1~HC%>M__jLV?8M11O`=rmX};`~!&$7i{2%Jnet
zhyasbC*z;q`*I%7cSDsPks7P-2cBBDFOxhM|Eq@)xCYi``xSv~@EkcfDwoj>TQOP<
z8eH85&|E8kfvq$V)s0h_N};kuKNi>|7MaPz7_#{|7yxNJ;@?3VY%0sPe`!FpoX7)0
zXXfN^h-rIo`4|nRjVs$#Y9@=9v$UQI4=zyjh=o?}MTzzj3&?|M31INBrp@yc@7Y6o
zs=ZNuM&pu>*~VCjbk)P~p}ZIsTju4(kZpf6dSlD1Ri0{@v#kPL<z<|{9w!t68;xx|
zKwG!-vH>(T5lG=O594Y3c5W;d3Td6eG_g!TL#(!S<X>u&+d>IN=h1^Vhvv-Auv(Z1
zjr?dB1yH)Q4=95dl?&(iiE3&9A8L8J{A~e5ZNi))WaLO_NmNsNTcC4WWBD8n%N9Of
z%uoEg#shDA&)_vMeEsN-zz|>_-SK?dtnK@9^O(5WFVxzVpFl8#PQMN!o|{QEG(H@f
znWp?q@*d3K%C2a(f2n=#fk_`!JYj6Ms5ioW23aq`Z<hQrl7MgGl6kcoRXzG1jKA|^
z5HKY{&RYq;ynyWZ1a8C>nlYjQn_{x^Fp)`?)Q_!_U*3h*aY@4I5dfyYd6)9Hhljt6
zt{mUXbAc3v#b+6X&EzSleum{#i?UkFhx^XrU8uVx`=%;>)xnem0L1ia4EQc){^PVC
zR->*QMpxd3)S}&q5@EyKIR{Aj8oaI%o<qmdLV_C@?e|m~;ZDA&v7mA>4K*%f={oNP
zI_8D&bmfOsC#B@RPc)oSs5kNwHAvdHYIx+u2$HmM^v3iGFi_Jq^{c6Yay3@i1X%LB
z$kSZ?oY3BRO<3T4r7%Vz>{!TB#MxLlgTlcX>v$<bKo?-_i~uxr#H2daPPH5~(+*HS
z8x)(HYZvJ3yz%GxcJcdLN8h&GyR?Qc9<>n|0?eZ}luyqUy)w+sHnY`N!w&kkX@;O^
z#z(@LrhH;%MKj!FbW9DvX0-yC$U<_P8bc|<2scK9&Cb&3CZI_~02#&vAjwOy_l7ln
z5E?AVj82iQG78E^sBv&G)>B;XVT|-w;ks-=LS?$>H8IX4Q~t$07)iQ#PZ$RX?^G#}
z`5YdM+w$JSJwD%Wdnm-FPh=aZ89|bZ{wvCj@SRbjJtU^illPP8UlWRpKFuF7$x0aV
zTCo&PAY{1%8JHsg<fLZ#?6id!F}%L1KDc?|tMV7HwAgN)gunKwO~U^eAocdm&2W+T
z63J9ZRJ8R9JDe!Y`b@>RiMnp=YP$mZ5aU-w^lsy(1u{8Y%3Lf6asiom1?XHbj!nQ(
zY5}$EXvl0sDd4*2gzlbe!!m5Nb?R&~qf>;z0NV3sLVfW9h=i)M@@23ckVo@&)V5uX
zlTLDzaUyhv%ztI!{0d5*KNlW+*Q@(yWccdQ7=a<cJQ|}oTu1n>Fuk9OjMvhX_a#yz
zmcdkx+QmiOkjrSBHzRK+S$VqxJ*?t|nu%JR{E3*>hz36mbuz?O%i;DsnoEV%ViH;A
zr0*15LI&vPVbbqXxG1HJnCbJ|Spf6vrbfcSb~DUVgCgk|wL`X+$cfQq0Atf*2N**a
z+3vLQz6WC_y@!Yc_B|QDIMa^pYJ3%N!P~T4X`!`mp*?v|tVD)*6UNm8i|w#viFx-*
zq_L)C{|1e@J_Epv;hH?3{oRi{B=AxAteukti;733JLX=WYH3uoY2;#zeYK7drFll#
zu#QLYE}D(q4x>yh0z_MHvz-dRk!YqU%nmT#qeexW4loWWMjEhFt=7teC*K2S8alN|
z@PJzMt|DgJc(I?_ZXK0{CKD4813L42!{~L-4P8{7*GKozRc0h)$EE<}YG~ojv=+~E
zGU<S$P#IOTO$u98+-eCu5m0#MCBg~#Hp{31TsZg9(e~62RPX!iPY!SCM`r|v0Q2aK
z<8Uo;1in3t+*}W}KV+VHgK`-%9@Z0HAi^8#J7eBW%9zNYW3*(O%^oN7BO@aP;TSc-
zF6d%Sk*CWCJOPs%aGwF&>uAdcJ(}KcEmY(CXtpo@-#}3y$p<R5e98DoATSf=VI_^B
zqa*nk&AKH%X+2_;IG49#i5Op#gGfX_3rBf)%lFT$sQ=GcPlHF}Mm_<s<tWI*w9tE0
zUoTU;wh*0%@v6{ykFQ9+A5E}0K4Ot647CEle5?UT6pP_G%+_DPPiHZxD#FyEs~3xM
zXvRv`I9udOVB;WL<D|53M-;Y_lJlpbZ@&%V!@ABNziG<T(Q>@5vd{X9B&Bu9RN_B*
zuI2^Vn98ubN8>Q!W8jeHg`f)i>1oZ@OvTx~A8k+jX8n0sO^X4lvHdmuer8)3z4rMb
zbMV%%G_ozM=K&f*n>i*0wex2~b8&&!91JJ3gBo*JaYUFJF1Bf;Yw}r)1#&YT>TjxR
z)!!Lu7JjgH|C@(O0E3t2KjQTeVEz$<HvGvyJObYtMqbSA{W~<by*h9GKfLNfUX15z
zoPp%RlV0R!IT;xY86k|33pjeH%zx!}ITE!l-3cYeWjE5N0+|8Bo$6!kYojpxvOdH1
zM+#GEKeSK!LKt!bN$!`8m%`R|CCos}62MqU%o@e0w&dXeqm?`5SjJvJc^Sai!T3QZ
zFD(7lc9mdPZKZ{Z7_=^yytd9*@z<!(N?!B(Iw*INzA&drR46B+hkm2#{Bu;}e-gvl
zpg{3js^OneEr*+!!OzO{Gl&f-4-+!WaSUeMwU6?^;-EB@@F(x0>U@cWo`)9vPP~|-
ztka4e%{;pY<1LqE^!+nS{<9)G#~M!?Uq3rKzv+>@oOE8b9WfMjY<}L|8`302ztcZ`
zr)$dk>97q6ZdO=PdumH49lkka4?ibt%pMA>U4Vu#VGfWjpFf5Q^I4n4WKuUw6&cB+
ziQBQZ-nt&E8=<+fcsi_hz9$@g>pL#J(1!26kP#RH%oj3R!)3qV5%|tf`Xb8eev=Zr
zYh}bxX}6-m7S`NCD(0ccc%)hyzunyd7!-ezsdB^Q>0p1PP+9U+>@)KW`rb=&V|rdq
zG${r!X0#k?oK5XXZ@t!Afz&oO@->qa;c7ImX6egPP=gqS$0X^m{g09{l7z7knL9=v
zx{LX2!;;JHwumqBID8E3A%MkGX=`w_ce2n8+7xd9B816cRJa$yw<vLila+2b1-F=j
zelz4Uly#`O5AeH_l=0Uyg>Z1&jxe{l7;ec_!i^-obqE!5EINQOLP)BBSGxw~le9T8
z)P!<1P)@rg&r<9B4Ey$e>Vpt+G0#(BPyt(w#j64umKK#+0-Dx_Ss-8lqbeDJY@ZpJ
z3O((7+vf_Oxsj(wrAyE7owj2_nwpd_h<a))d$epDbM}RUFB8%$kI#e*g`3mYzc^H9
z4~7e{(OR$p^9>rOorMgrzRL2bGAsGdF}DENw!p@qP1^=|5##J4##y<z{PwXh_NvM~
ze|a*Vo#CG^U<8H$^92mnaM6GK2z+lCe<3wAuSR$GW~vs<&?vkj+4wY9O|#x)Xl?!~
zJ7<v5pY=vN?uI!&)1D>3vF`>vnBJZb0#CB%?nwANHojIk$HH;gVC5`cJT?-(l)S|@
zLQTbSz5rY#ekp2jS2-AkU1Cx{au|I7A`%+7^eJ$?eHYK6=fR|%R5;K8CI*nKoP(SQ
zKvI(xS0qj(1C~RVIGiT`a-u<^pWiFgnBmI>_WuFEp+a0p$_qQU9}F{<R@k$7I$WEp
zguSq@ssM%j$T>W;B2?ZHP)F$rEpL#bzARA5lt0nH2xKlW`+s|_5{_H_lecgXm`JY-
zePauiImkGpOy8S_YQ9GTQ8zo$QSzhKCgM3$$)<otAfvx(hG#?pP(P6f8qp(`R-Tp-
zKuHB$V^d|{=wbki4*z-2H6eS$^Fw*tHDMz=Lk?|{9jb**Jen4DF(V4YA*J2m*jrL%
zP~u;Hmxb~GIEVvXH9K3Y{K`bP_-{7vesiCh(z~dJFFxA{3;||%Z9J|c@I7JTxz$kl
zW#+RlBu``E4$W?5h(wQd$r+#JT0q3!U;0R)uQcl&{Py7&w1>moi1iW2m5O7SECGxF
zMTSqaWX-5qJ}-%QK0QDv<y&Br?I#&B0YcK9*J#ROl!>8a-isAr#!aP-rga*-EO{|H
zsJJhdTZ)V^ZCmf<{b;+QB@>RaoCt~jI&MORP?&iXTAB590boFt2MGhM@OeWmj#=MX
zzB`BMRkB_orY&0zgk8D0FnaN!FyCDYyXnDFgh@5d=<dQZ$>_MovSA!lP5BcsyRw`V
zO#(CqY6R9mY5DJ4t?-dn!@^?Pd$Gv`Rp!O;iRWXYhZ;dBCHY-7pbzLiTAl~9V3h%l
zhd1SMDnv;$WIQf;G+M~Ps>afF7Sev1lu*p=X#5L=2Ic*<+TL<tJrB!l`;jnq?F&Mc
zrf7@gXu5>^)S{t!<^*(QxN%G#1T>~_t^AbEzZiF{x5%+(NPulFUp$(t7JjgO?_1yb
zxZX5Z@WzG!a|NHgD|y$yH$Qu%*{=Okty}#%$<HXSqOg=gE*1=nG52JEEQTKqkeRh;
zs0MN4@(|R+#wWQ(cGdSU{oc$vCLcu8)%L7;7s(3c8;Vpse&fV=crJ<lS)Suima_$5
zQdTd<>@0a~vuy{H^e}^J0OO$@9f&cs4M?0hm-0?llVV(MzK$AX@}fGpdljsAB{IoU
zo}^AL;uHY$5bP>cV`%u54`p&Ti>#(XXh<elkW?7i60W5QSo@&|!bE2&j4^W`r_z0%
z9odyFS_x$#3#FMN#SziWWyVDvmiNT^I<Z!#1G1hZLH>3OuDm22tpH5nPJxU(85}J<
zC*_2^e&35PV;;r^%fU5;Arsi^Q<EEtV-y_{3ze$kU)PDeUlA-6&{Dp~weh6(4VdD6
zXgVMUoup;NiZjob<7u$Zt5BR<yN-m+p_@Y+iu3x&JVl%1IHTln&YcJ?>SDSu?XuRw
zRHwlCkMdHrjvVa)I)vA%OKWc%&y0On`JTUm4?TQ*t`Qgl%<%enLPy|L#ksxBPW_iF
zt;#nk6onXwhg1^K#J%%)z0Y*bJjpY#98G$pZwC##f9qi{oBsNj14!iSixw@0ahzK1
zlv(S5X^g9OxV2OaUpF}tb~USEj=n7U68mXh3x-&^mgvn`?qy(KtwLU(DTJn7*BhI%
zE58yCNc&{@3ZaQCt!$cmDWpe8_!IV_*is&hw8f0Xjp0|rF!Nye%|#Yp(~O<h740`x
zQuz!WT^T>%wikSPSo_TVVTw+W+3p4*rDm968?!7cHd(G)3)}F<cEiBRqh8P=MyH}p
z8DIg3MkGf*XB`lG9~JTUm;$q>;_CpGLS;xynipdUfBuS9X8nWo)bVM5WUu28m^TK9
zYwgA~)UX(O_H$CMV}>UKpp5YOh=ngbS0n&yh*-dHYB3b&I&=<1MNW(~*o;9=tgXV2
zRx%TzJ-<JUTzzwB9ehT(SeTP!0mq5r-qM9oS%q~b!Zk5S+jEn)mM7^2qj|i&=IX^x
zeY5>b<C)Tb0BDA6v;J*&$*Up2e3pS3e*E~3!1opBv2ER-->k2_5}!d~cgxWjm{`cl
zsk;eaJTH^NUkN;XW}F`Wu1|-JK{xwsa2a2x1ByaQ3NgtP8k<!H(u|90Vlv@d<|e~U
zcr#n6oyd=|PfV$;RkXlDD)!?0I#$>tQIlmjE#a>(WvG;>1|SSz63D0yMRG9Fdl4!r
zM5g{QEp*4qSne`C?ky)GZ;0<|{5{2T53$Y*%;J~$@uos!BKz9-DRHif{#>}`8P5w-
zgxVVS-2>TTjbcpHqd`xBs90Ily`xxA3;T%uZH0A~;kA<gN~UY<JFKI~v&O*O&C1_T
zSWYU-GQ+5$xv}TH(U$p4K%>{DS;jgNlPs~w97y~e*`8YOn&dg8a}@t7*Tr`_N2=u+
zl~*a?-~)BJtaIEt#)f*^#GtF?(fC&~k4C>`RFIG{?L4-%&apSP?hPY1y(lzy-xSUj
z=9$<4$^dJPu9)rW3fsaDQ-e3ciEXt4(6zv1Baf%kt!;$r%JM^@TKs`<^ld}7+2GAL
zaAqIj<Kaic5qM%o;QuVnlP_=o&}wu2$Ee^qoG8gQ>6cM9gO(=wLI5|`UOl7GmeYZ4
zE}UE)Ghfk&MR0Mi#%Ib2cnqeDQmr*b5YU>m4SOHLODD&|SE7TzlUZzWoJb0Bdl@-v
z4kiyJh0F|LeE)ipIe~(qj!lT;LAa5%M8CEoL#~h%J0YVkiZVKzp$XcqW35+-AsV*|
zZ>hz1O01t?xnq2G5u>RvnY@||hSL2zgBn7QnJ~ZiaCpYmw}#{IeJ>5wC`(k27+Vf;
zOwdt<j?xU9X2w3lI_4PsEt0=iXh`q@BTZ{mP0R%h^*bqOyuh#Zt=A37dx_yP#<4~Y
zqXFQSArsiSi5`rd9b@Xb$C4h|2r%x8_H#8xyXK!fx8wn;K1ty+G2fI6>mtt9;vAS@
zCZ1W{_So}h`p;FQS(AtvtY<{s%mjjr;Tmeq?fC<tc;icvp}9VsXVy@$&TDkqTt`Q`
z24~Lbf?ScPAqdbc=O$7B`I+qca;Pq@|G{K_;(Isleaq0B89b*?j2RhjbvOc#JOaPb
zSnB@I=Ia0aqtH1ro}IpaG(Yuv62>*v&v?kn^`zt|SaG0Pycpx>wZu{k4q_OrA6DzL
zo{ZUKPstxDPat6O*P3Zt2uEI1Tuz45mkZ&6S_6YrAd$^)uy`{5rd;oiiHE)7UwfUt
zk6wv&sPt)itUA?Sc`gdWv5wfqa<D4Zdc!Ee&Ql~Zg%7p7_NU5zukd$;?Qfb{79$L>
zFxZe}OpcsN)~fk8SC+%{HX416P4Yc8A__68o<w76UX}P$rLYmk!Wj(q*%nRBx?}7L
zwIn8w=`1c4(M+(PI~aU)=qy)dVZyjgO$x*S6F}}5-}WO&P<|U+wDS|}^q7+7e6LWb
zuQjVZaSbvH>QV5Rz{AeBPbt#57m?VZK_mh~h71}`oq1p*EHsYcpW^id=MN4Uo{QJb
zxHzZhYarYKAY_?%U8Hz)3xG7g^gsw7{DaVW|L=v}3%7?orD_-*9}mT?hiLbFm}SR8
zlfjptS0E3SSI`ZbcLuUVc6v*g*t7jRHgc6u6^^|8xx;(AKLY(&!EjWDBk&|1fmepf
zZ&(YJ{|GGHVx1bbuNDGQ%K%lBQz;R>8RceDsKrB6T2S6Yti><`B^y6Te%1;HLpFmq
ziG!1VfzuNHk1bQ-MOgh=LP+R965IC~cgOk#^PiH-c4tNJB;tt`cv^TXqJ%XQ^U#@g
z(e%%Z!i0AMiewDH$Oe$Wuk+b?DjJtT7141udv9RmPP4(=*_RV+O9WfEVP@tNDMHC{
zv9PpX%FTS$zj$Rh@#%X*xbJQZan<rG+8w6^t9a3S#;BQC5wm<44YKVR;}KrB2p2U>
zofsIvJ9&T^{okYcwZ^)Qh72gF_|m)w*lkN}55PL|hOqwN%fiSwL9F-F@#Pi^@7Z~C
z@4}P|Y>Ye%mL6$x;1NXkOsyq?ekY)g872)M(w1Qq%`~g2Qtb}gsHy8}%nb=#luNUD
zhw;xfB#;O2WUQzY0Pe8v7VBNljfcwi!=ZT13ql8h&6V6}Xi`Jd*jTpESqAC<yeg15
zevOmo*E9&t(A-=^a;E%K;n;ir-zV|y0Hr`$zx256+7MtKx1k<x=+PK~Zw-^%bJ^C9
zRy)=2<>K9H9*r3+r#pK+ivU3sT^TPA-y{#C-?PL4W-vr^Il%PbCVD4&uUYnOljGrA
zCyQZwc|DAe5`qHptg76MB(rTL{$q$NQIL(59I?*6lGh^7M0psGJGmwphDx8cjZUYe
zKSo=jF!C+O0hm>mflX{chdy$+m&NX2MGL$ykbs|*jxwL6<0@bZ)$&^2-YR2p^^3kN
z?A*0K-2Z294tt8oq%_E)ux;g$%<Cakrq`xEz*_deyxPM+maJz$h218NFIq6;dlA9R
z54ASKIX0+7ySHB^EGC*VpDlJp6&NeNnP$l8+<!&L{1*iE|D!@&lGc$&)RpUIepGbq
zBG~$H4d&W}#y(zc&>|@?d;ZLQ*D?CQjBo-vfU*^MincA57-K|77$YULiK&Sdrz4cN
zRpK`}ek0;Q7MD~KH8sY4Gd~#`+X<afN7LSYEyp_^HfqEv@dP_-1S6YewkepGW}EMu
z2mxLC)zr@}zabod|2I7v*Zh^d4uSvhF&u#_W&~arHZJ^Or~1=>PJunapsO($zq8rx
zlA0j2!t7ra>SCkG*hrJhWxJdTY+-au{PxY3o3#if?&!qafX|;@L(TtcIa$j4%9U`Q
z1r?rR(u#-SzfqkCliD757!%AZ4P(O`h11wG=kA&{)%OaEDeq#`WxW^p#C5hX^p-^D
zH!JMIGW)*D-&OuDa6l*c7ETz3(}PLQn060JwIB-R=&5kjB;TAl@le=xU~joQHems(
zLiBj#xd=4u|E!BZ5cU!uraKDVd6Jr!NxTy(mpTG4x<`sS%ySDA=b|*t1aN|N8%p;w
zNc^7|1Q&htj?n#?OqkgUDf(}k{4R@OR+!CF?@CPZv@jXtspNctze;O~aBWX{TRf%O
zUBGKC0C**Vo&h32C;c(%U(}Ys7-Xz}g5~D{5Ixlry9iiId(6BYo+hBl)6S#76HprG
z_gjGQZ5Qqj*}whs(0bP&hWS&U2?tC0uw&=0Fit+JJG~ng;S9$Kw@m=0P+bQ|ADsy!
zyJo*J9C*<iu9zF^G1}1~z&u6+J6!E!KLURo8V|g_+xfmXiQpCTS0R=1r<xUe2ub={
zZmi79i4;=<9E;04jn#uSddS*q2I})>3(Lf$d?I<d>CDhwu^aBA%VUFBycn$3c~ajA
z$P7#{@B#^wD)6U1hlu3h0wc_T#Z*59CPFoj-^hEBeKs#d{a7%3*=C2@mP}Y=+vi#I
zJYcY(0!tP=ui1H|?Ard>ablSHy&Y?urK@`+ol{6gU%&w7$G09nh^IkDfm#{COrG@_
z2pcU~;31G95G09w1<NvH;+TPd<Cdqvv|~*3VTR=>W5mPhm@ufuJEw3HukaZD_aj{e
zZwjl6Zw$?^%7@wMVi^A!hUIq7oyuBlzhQrqHv(W7B(1l|pb1O_I6Ak=*Ldenfsf9!
zPby52pJ!7I%Lv^`UZzw8T={I2b2iQR&T*VGjKjFRGL0?k*Th3D)S84m+kBQmK(y0k
zjVzm|0CP)bJ#4>tPZ<B(*N2gJzAo%KeOK5=d($2J4~62c!=X8~Gt>(xyyYf2g^Ufq
z&x}lj@tsruJnVP@Vcn<iV;CZP`kwoz<dOftNa<a<TK%Omv(n}geKzGI5>O~~=0cm^
z_a$=9?61fAU60Is7Eh)`5dc23a9KCpGExk$oEi`N%4=a9i`XF-BT%ryME$@RPag70
z{qEZ&g{_sc5Y574?y30lLKUq;Am^XA2$RX^u4BmWB~<m{Mm?;F`6U1)2eOE9Jc}35
z#N&|nHi}UnSJ5P|b0(oedR@tHUh6wmrk|sG*;l^m2UvMA-1!Hu4~G;OBd5^9^^(UY
zfH78?1)Ve+B*8J9(*R&m)i7OE*r-mQoD4>PIa3dJc2}WC(^ZpC5qw6%A5W>1X+ebE
z{noId7BZjL<D0_dUa}}JEoO7iX=gk88=2NPn;mO)8v8oU#*Q)3jeVHn`Ad}MSe&c$
z72oOH*|`w#am-`HFhx0UE3q$+g%yZtSCvR6YijWtOhC?G00*!F5;=$*M)jjb1LQe=
z&z>>PqCr>aKU3d%B;;;<e(3DG&RQj`E-i)H5`oI*I!}DtP6diL-7{AgE3MUVZTq3O
zo%*~6;VHI|<w}Q-;RrlcMj+Sh+)sYur4|lT$V9VKOEzOz%;d+167J?IL^`kxlL3o=
zFC&(gbwSLOS8@HZW3b6iO^hOhNz9+t%A&rS%5hwQ=Yq5n{%<s9SvWDp+_f+w0!Y^(
zRHKb~CBN^Bext%l<zJ)}d@)~Gy1!a^=^fqnUz9TWL#Fu4`ee|>Ym#@8!aV{6g~SMA
zu_C4I_1;-lQYAodIGp~>o#92V_?O{R6I1k>!DC^7^6LAd^%}~F_bc=4X?GP_X7FIT
zgvXR)(GIIPG6M>#9%VXH3(H;g5)d?b9;_T1fAZGQ_`JbIL-prjLq_<utU}LA9$yzG
zU$WBO_|55%xtgbi)iiOCLzpH7lC<$Wm<HU>UYqgTY}RQV4FQ7Mo*W)ZfgK#-^D*{U
zpjA<S8N5VmjX<1BP$_}Uf*5vG(q;*Jbx2}qvLUZ`3N5$Vg*(IM;_;A05#Z`uz9`gp
z?+K?TN5jR{CD_gA7*sb2)m1`nCO_6*UHnGY{419p(bMDIFhutBIL}X;qlA{LvIsLG
zqu3b9i=eP?RM-+TRiihXjrY#+ma~veH9(5_i%4<yh>KZ#8$XMlMTXi2FOSk2rH=R`
z6JxZhFh14@g_*zQR8oFLp)loF1U`IjRr!48>4unx@oIgo$2Mh@WqbtIvyLHuuu}c;
zAMCb2y&S@itg;gu?7g01jVWuQ-Y-qSO;l?E40ZSx_7b${pF&+)QhAkWMq=^7V_^bC
zg1P;B2_+HQ%8VNOO#if`WvQh7B_Du8YN)`FaG-o76KsLDEZ!C)%vct~WLEX{PW6^?
zFL$X#4I@AJXkK)04jb=#dkELi^!Yp1NU>|OputL^PS-3WgvMVz#h%l3pi?Oofr_-#
z$jxN9I!c7T6xJIDWTseV7VWZmLWQ$+#tqeKQ(9novF2Jztne`5Wz{69&jFx=l3CbJ
zNNxJ`?P2+^e=jut?k~eN`Es~+|IRQszbzD}cZTfD?$8~d3)%6hr){KY2r!T4T`*kN
z6Fve(6bp4Xi!Lj8P^d?VVPy?e7a_wU(2<E$++MSPYtb7W4S6R@m&;hW%u1i-C4JwF
zCDS!?&LGj!4P|;LH`UUYd`gG95o3lKQ^0}7@+;PuKGwRJ7o%_upkv8?c`6>RGcuJ@
zK~8_umtirJuYPj~f1rWoX8jvNI&1QNWIR(nh#e=t(VmHD;4mRzQ8HYHmv<OOPxg~H
zS}BLsGpEBf&$-d+NJdc|7P!#%kq}<ND*Q|AlYmJe6X*2gBal<U5C<Dc2*;Rmt#J(c
zK%U>WFy6~}UDUvT|C?zp@K{_7ZwReF{Id{dNaO!Fytb-BVSX<C2?fEt66N4};1fN(
z1^`h;eL&Uvc>r@1it?CpGd$-QKal`tG%+DGnS&ml(fP#V=~8GaI<kD;Jj)g!5@Z0M
zHep4P4R(U3qYr#4TzvEIh1oma6TWypA6|HPU)Vc4%QiCg-DBf#d@RP|N?7L*V6KF-
zG~CC}djxWY!UdCx=gQX1Ri1(?G|O`lTPe<TF+!teh<KW(LlRuZN_5>w0<anq6IxXM
zLB<Ql65$goF2wD*fCsaN!Ro4~2)}897|Qr44CWvqAR&gY&`nI<E4M-`5+ea9#-qYc
z@dUKIhyYy(08LfFv%U?syV9utdOCP!@I>{)S-D<bPO684s01DbI36_9{yI1aSfn3j
ziF_3nvP1#8)cJev4~L$0eaO?TGnNlpfu=yk!)jWe@)4?{5g8Yg4udKu5Q@C74Bsi?
zpC^2l%itLSm={9z7vC7Vul>AF*gtZ?*M;_fha@nS%}oAQQ)C5^48%Y);)d7$jH}p^
zvDf*`qmrC~g4{&VbFH%(HZ+oWCu0;^Y6R5gdxEn!$Np^Pcat1TiF1}$O%3N&{bE{p
zEY^fT+fX7cdaDY5QjSVPx@xF@>Vx6<oBmhWaqctWmCrd8e)P+}<abYf<hPIiBM<Y6
z|Je{=uJ}_pJb=Lw=;ZTvNdrruA`7oLy^AoqF*mhh9>%m1Xhe`nUX25ZL=|N;Fj6cv
zhT%0<69%jZLde@$>2x;9*}R6f^?%kNv945#5ekGWrXR+PH_?#gtw?rX|1e(C6_a#J
zi*;G=dV!A$EBRdN0Q}UHh|h-X>dQmt-wYlfAo3O&yeQh@vua~~DL3Q=raCs0xK~)r
zJP|RjT)S%o@S5QW&)oe$nAx$F3iNHT$?#&RNLScrgk*LeK=ObZ0HgQ(t09pB*MKT^
z!eWHfqHl;}#;@8X=)Y5JyZS457eD!UU6H>ZR$9N+-T3}&c6z*!pZ<LTjb!`tj89go
z$d*UZgn}pMViXV`;jcip38ilxshb)!zyg$HRs4@|rA+Z2t%I<QqnYNMjqy~zCJ#?s
zj6QFDrWhyN!OO|Q0MI#fS$`GZ(lNBM6pp_APr~2+#{UxvC-0$P=IQ$w0?gC*+&?9c
zoHWSo0u+VLO^ld-Qux&~$;q3vygUb<0v!*gDS<2ctGpTaZc5x8BN!e9grwlBSv?)?
zp$m84dnkm%f7tE*@J6%qDvBRlIoL`m_hLF?_H@_DBw+Mc9*c4-7(#77rCdywMxU!g
zr2H;^uufumvDx^p%TE&3E3aWbZMs@24D=Sm^1|>tEHUqrErJEl#l-xJ)d=w!f2a6%
z<M<i2*$%sIzLw@(Fw+2zQUsV1-iSOG>62M*MxKfa6)Br6Ai$<jSyrA6xtJolT_su*
z&vmE6fpG9wo`{R;w>!&~|K3{ruHOLOi}~4~1%5AT9%1z|_Vn<e#!MkJN#|&6lsBpX
z92-rp2=a3!!e%2J^C*BXBEcM<>9o8PPL#a6tf=Hh){s7&iwl{`vZ)K;2Ld)+39!cq
zw<$iia5lX8kACN8zTu{?`2U`Wad|Rqa|kd`hO;o-hevG${@cdI`<c5pRPd*;m^>JH
z4H8;ZfWvx-Xa)<cxM!mfRW$ZC6D3V8dm-<KkV;oYd<uJEls39qy7G2@=%XQg>0h9Z
zubuvydTp)K?0i}Rh)Mvr@j|SEJU+!>Qvsja`AVG9^DHVbROl?`YZPkXPOK@H1E8tD
z{8GLB-hURtUtT^ie;wK<SXEiM2Zg*mx$WMGnsNzT^eB)|@G?sIBCeKfo(`|&nH4tb
z;ndLw!=C5e6l!AR0vPgHN_a0Nz)f`BoD4A@j9v@Osx1_xk@7F(X+I2YT5ASg?Ell5
z@vt>aebu9OBL9&q`lZ&&fBCi0-rf#VH+J*We^?KtvpT=3&ezkc{?+Hl$4&67g3mQ5
z;8o>aa%0raj9AUk9Or90U@^_fL$JY?FEIE86=`aH<Yo+DEKlcsY`S<u@{&ay(WGQM
z<5gZc{gYobcl}%bk%x3e|7-{_SM=!{?))Q-z(`@_lZwRaktF>k8RO(*=ZZz2rT1Du
z<Y41}C3hnIF9}O-0#)Zi>o^RiTi+GJuU)>)T(S7>4UA5kZMa9|I$K7MmQbet;>`$j
zJb&-?Emo6htTdKa-bSDC;dwlS<y?{6{v$u=7XNNhf{UtgF)zjS!!##1=9I!WcvQ)Q
z5kTZEztT_iORRG7_PfKjtM`Ze&N-;OO@5<#5qT|ujOAY<<yoKu05vFHRP9Ty$P#!w
zxh&~*zUwmUpMXg=6{ddr$#O;hMp(b=f3()V<2N7>=d-i3QzJXRzgU`iUpqf~K~+2|
z!-OMd1(q&r+6tS{iZ_}Yr~#lgMy_T8z@O)vtpLRs3^b`vdjYRA0Hs)MYS)tZ;e&3T
zt0%`3^lBSV-kY9Hm5}h6%zpK4vxh(V<-4D^=gBfgkH>xt0p{@-<>59S*AXZdbH7wJ
zv+d%Ny2xGOR|$Nte;K48dsfCh9`!EqPK=s~=8$K>;D_$(2%J3Q-Jx;EBOOR<ZvIy<
z<O<C?#V}T=uke?iu`WX()f8AM48}&RK^C7;pJ$#?OL}SYn(%&>nL97kD?jn|?doTZ
z-Qzj@7Xg_IS-MxEf7aK<=I3HiDc=%-!op>0JDh}L@_@luSpCccA<I6`-f)DZH^4zb
zqC6GPzPKWD9%h)rY~q=z`(~xRL3d1+ph1`9QRr7-%(8L3o|%x_^Jn3<>mS#X{dsKc
z*E&n9zgRo}Td%IKe#vjODm#9Mv;5mqTgz^r?S%O+?&fA+lgm$jiitwGTg0QGtLGYH
zy9$9r1(#cj=&`}Bo8o;Q(aQqiF9C<>?($~TKSmynid#jGSKdo(Gb>!xRmk7*d?{?5
zoxZu+Uc2MVx83}*&ttcr5{o=-`Pom2vo}15&pHC%Pm@Oy_lE=}3WZt2GH!O|U9587
zNnT#q7++hPF?k*!Qj-N%LwgA|fLDDy1R`Re`*Gjw+^^=#8$U8>CHheX05I{+i(Y)_
z75%ZQhS{+Itw7vfZGRPxt6`+ibXsqJY3RKCAKKn4!bo!mT}>QXTwhIqpdi^OV6cZS
zjgxKC+W?1-OJOm~Iv9W`=+_PBNG^Qkum5aVd}uM;|DXSNxV|v0#xqtds*0A5)Pk9X
zEBBxyqP#E^kdNL!cPMl>$jMaKELmRX*@XnKKzZfyP<h=qgpYjNKeYc>!uLOu-MSn9
z?Ehwvzh!!)RBB93g?p;yaHhVF6bw@i0W$y8Cb~=hrT~b@vn)ux!NRUmaCTyc6cSW~
zrEy=QKMoB=zu{dgcfILK7>9q#J`O#Yf6C|$fBTq@z)$__|D6B9@A-)vzu~L@<+r~4
zCEqyvg=}oRH1(~Uy1@Yt8Fopg$e=sCL`e~Q@2^$5A|{Y(UlbZ!2<>-~ak%000T=Dp
zH7m<Mx;8rXVad7}iCFm<)+SFyAu<7-<y;KUl>B#EvOFDelN7eO*lFMU`GAYIhsnA_
zuwv}6nCurxK{IPFcT^2bDzS8inJ2p4KT!r{vepf!K7C)<dLzP(TOev{R&HL72n3=x
zE0HgQDa|kaErl8Y)FR=ZFXGpbtB~RJ)@Fbh-)Hh8ulU38LJ_yF_{V?gES&k-&f<6d
zbZc>>kr{t}vz+;+t@)YhZJoIt^|kE3UR<j@&_wNYgz#7iAkUI)v)0Tp&tr7=n4if&
zlh%2Whcm~xK~H?+%eUNg+Z8`Z|G)tZrDC4yXZQ*`zTbPz>vmU~l~=8-Fa3*Jqjq@h
z{6!+$o8j2G6YI}@<`@6`$3OkIzxeqa(l6K7Kk`G_@gJ&o%CDJ_apy-imp1`RH^P>l
zC0AjZVG3LEcO!IHp*jD=--qzKE=2PAKQg=be&yoAncKt4!v1j#xnxQd-B(?S<yTA}
zOtE>Fq>-m2TCHZa3$1YeVko`XX$k-E15o;NR&ldctQ(klp`jFqjjXIt4PfC%kZSw^
z4x{oqb$5!S_UfmPhU@>?SA_gww1YpsOroCdj*OA|>tq=lug^DL9smRTP5N3?xsMPw
z%C;!9BxEE;6}&bJ8BMhs8H858l^?wJA3lmJXNiBkasP)0_I91uzrsQP%FF9pzqdYJ
z{V!vs&JNMrO}1ZI8bWK<TSqxIdC1Dy5L(mu5W#E<VaBCaRbzDRl`{%o_?`|um@j<H
zhQD}}N8lZQ@!rxK{`X&f<Eww=H$U7iG*6dnYd_nlSFTxJSjbBpvR<u*ojbNom7C@N
z`mF1p_o~l#^RMZYe=V1tei=#nvtoJWOr6&YE64ads@f6dM6mYdZfAvF_WySXlRMsP
z;PQn(Zod2W)yDMpFFignb9@N{uLfjotclXd5>{3*rocqaxm1EKkws)x!a^r|YNarG
z%QuDk;^(_ns#e~p^~iuJ?1ez2QI$ssxd|3pB&5|RM|l?kO-k@<c^BnJQJBn3{r=OT
zS*e;AvxXm)BPSzeWv$clRF<>iH<oMhg40ck^p&e9mZ%aU;V-W!qn;`(H>>LYO#VAR
z-{ZK_mwQ`S_?^FATi#h)?f$!3vxa+D16~+=#vh$bn^4%;%Og~%42!F2hrG^_Qt_H^
z-2H6Tc0Oev!yL?0_S8Q44s7F0rG(*o7Vg5$<!W`$`HL5JUR=1iha2vumDR<abz;+-
zn;W58tA@4J^{}+O7#0>5=<l==st8Kfc%DCZzOsAEj)R}N=VM=xkmC0gw!Dab^WU}B
zT73a?%x#)xGB2-TCO^tt|F_;4%8we2^o@sap6sqJ{q;m;`NdO>%`hd|cNOZfQF&5&
zNNr$)5LCI_di$4u9yyoG&*t}Jb{3JdukVcY9WJl0s&%o|GGPMR@WJ8^NKLA-aRp{k
zmQ7*V)C%O!b;B7(;W@wlljKus;m-f?d*K<)IV$~CuR}!+u1=1pyt_b>TQuJ~Rq$1|
zap=G^5o*Ds*;olJgdM9WGB=S-sWVl-Gfx3yrZfF78t>Fh|7r91#>s=FPNw#onbFL5
zmKd+mf|zzl{{tp%07lKv)Hg_E0Oe)QEpOcQj+LV~K9vV(7$SQrpWP?;*g6~C{Y-9u
ztk!Hj^W^d4n+w&&TUS=swoXq^7beE203Rj6&5Td1oFdQV%{A*KQr_4I8|!PKT&{+V
zjm@yWxfxbhS3OZqNBLYS_kn8<-1N2g9>3%M&-W02x4HO1meupJhr&O4K6;ODc<@sj
zzkJ}9m$d5LAJ{WCesHE!eSV?axUQ7X&mt$&3iZ}Qy#C1ANa_Fm>dm#g|1n$s-R`-@
z4~6Y_5r(+|flsSW7kucU(2OZE^KTUTu#(NnK)DpZJeb&UP1Wn<5H8;FP}ujS&k5bF
z<ER9qSb%oDayc!k;f-`{3IS3BaABG%3H{A_B}|QMLG}g@h8mJQVKZwGz&V;nr)MFw
z{u#UShyR#kyHbDihK*yIH+<JO7LNRIJD>e`iyLd-K3*#9FRCrG@-Cubm_{8+5bK+t
zE@j6xMo;O-5MZ9tr}v3FxNfcc!)x{G&$P<TZDl}0ioESwCp2o6uxLyi`q_`r=^aBQ
zrcDb&SEn8Asncfwl`^2R5f&Gh!|M8as1YJl?{uplp%2U1^JlNF)*2t#I=%P17uHU_
z@rgT1-||QQ>^)bvHp{Co`0lUmc~w7t{F9xRjUBxB_R6W(^!9Hsr6cL&O_hHB;~&)9
zzZDgL8!QagOSX$YAZn~IhNLo~D=7mAR1|LXE}vE{W5parm}|>-JQ$Ar^OuJ4{o6zN
z%*8NEv0+BkTOh;oQ&J7!7deOtYRe3f7w}&XQxn^8YzUeWJ}bhaYH>`eYM<#8sY0!P
z*V7GP(wTh2($P&`|BRmBzVDVJoox5}8;#m?y3Oh~M!Y;eHv7iETE6`k{_$u2%K7CG
zV6L2Vb;ax{K=PV1OBa4n8c&sSnZ{X&M4~!;?(Dg6@xpmagcb`0!ZLU$cqcW0MSshc
zN;rM$OgMh(WXKmvd{5yU-b|D5lKQP^`2xRdsc}~>J5z78bQ3>e0P~Tzefr4Q)c7|u
zeSZ!}x$%*=eBeXJA9(2fci(mQ@~0oT|7+g=v5&su*8I-D`$X&9naASf(Vyh?q>uis
z;`<U`6lqofB7kwh0D7MkTz6TzT%t1=n+%Vq3j)cP8Rz}_iHr0w*$msSKNuE2a4KX0
zaS3!P=OSW)68xM>9s(#-h@n)MX|&c=ld=(d#8k+^AYq@w^HIMPc~qs+*q8os>nnD>
zqV<lmkHxWHDeL^hlb<}wzW&=QWq%*-{Tu?!qdlTe^Yzusjn|x8x(K*n;0T*!ptRQU
zSk9h4i;PM;RPjvIWGgo@LB0i5TL9$z`SS)cD}=$;@mR|1fK7G5V*W}pYaU&%s_%Lx
zY*xxh<16@+4SV+Nzxl|)tG>3fUioApSJ<Fbp;j#9+aLP$yFNcE)<5A~ef<6Rzce?N
zdqp$T`qE~rda$HkzO$p>vFq8_oH?CYub#Q<!$&5z&xhj+YyS-r+$;VGoBFIjMLGHt
zoknvLD)EF8^CBz=WHJ(96!{r1B(J!Z`5&t8cVSGJ6Q(#|1)6}<(z(^JdE#7{yXi<c
zKK3E20TN@(^pr(86SP?Ax=VzXXd_%E=Tf8MeY7+IAi<xasaFPyb^%7&C?3re=M&*k
zHQIXhV{z=mbzT`GFa($@<2+pvd)aPw-$)+i>+|qc71B~FphF+pmF4BIyu4%~Fy#iO
zXQo4*?`N~K)>2qGmoCO|3eaiIsn-TQ=<!|()LM*>Q$tQDZES2Jhy-x-p$EeP?P>EA
zwYB)oa(V5|#D1#fW@|IwX|{8<PV=h6&#IwR+{hFPo$-mO?r5Rdo|>3!mx`I{NG^B&
zhC^4qVP<^%uknn)x^5HNZ64qneelewA6VR2xvf|j5#fu7Ql>dp8oj;M>|B_cnElA!
zJqMO=e(rO3UVr1YFDs07zq-;Y-_~qE<Hbubk$4RF+0&<Yf9UT&96t4#JHpJ=WY~6S
z{#!by8Z#>^=f3jMK8e5Bc&PKA!huf#6EF50j5EXp9M+#67?u|z2AK4*L?iZMOy5^y
zl(F163ZYAP9u51x?j<3+doI*bJDVWnB{58^8Zx~tuPMr^j)rh&v0LT5Er-$h>5!%J
zJqxQWgQi*z?sX3K+zjf>>)E>K%0qy8v`_pMw7wz0TtO%6$-JwLO6xZlR+qoNi3EBc
zrAZ~|MO=OG=&^9$1NT~je#ed-Ve9;sPy{$S%ygwdtPt8&2M-ViBL@S3w9tHNNw416
zP*SQTV?Z83-p}S{Ih-fNwzjrzeO4+kpH|j3!e+TLTCLPaK?<YiHc`jzNMaqMryRu2
z?OVgPty{v>^aRw}QWzN-`DV?x*EefbcB_*u<#Hg>*uu(rLU&L~(V<elrwxFSJNm{d
z`&M1Ypq{O-zU;tPm-FLcceS>OC)Kb%@5Jj!i04ANhNng}?2b=;GK}FlOpmeO)%7qt
zJNEJ;2cG-3y)#??`Cq*EuMr1$^d3O@n;5JYtDQh53R=39ziW~yT&7N*0^=;+h$Z@2
z&RaC=_HTs=63;7l91DkD@ik%U>YZWrp_5_j_&ki;y7@j<o@$(B0hoS^yd~s;21JbR
zHlRo2EH&wTA=ES;AkhTNjOIGG{ngyp&wlOln^qpZBOb2lN*aM7z+6dZ>j~V~&6O%?
zkIW8czOxveDH6Y9n3i#7#{2L#Zn?0$^h&0%7SgIvS0QTa8mzF7fBfTwzLvr*x84%w
z=jcR2ftixeBgFa53?lVj9GS76C|UKiXV$IUiy1)4lqmG0B(v0z@qp%MCT$c(N5`Nu
z*TTi6Whl+$X%v2|porU`H5LF<uQ$Tj=ty|RO*e+^Tejd0(civN<GW@!ci|!)QZ+j}
zGdXVCVSOVso_hZCo*lN#&xS09Qb1KDgy&$F3#ZOp2#2q_c4llWH&dx>T5h9NkwB!k
z7Zy0KZ>)z8e&B;)k@fGP*UGim95Gd06)d9L^|Fbn+46V%%kNt5fTHvBbB$u5NVj82
zHls9B$aetVHv6p&3L!U}jmB|UM}PK<ulcW^l|r2z8+{Mp?(+TdpRvTWU7@GyVe+Ha
zIzZ!SPb<Y_tYgA|l6shp<BOrSz7e)R<En7(9Y+C7;U-=QK&|BjUIHdni01K#asWt!
zoLC!ba(0yEHt~&`{1wQtjZrR=F(w<;PW$T_-v9LkUb@3gKY=4K1ehmq@SjYZt}InD
z`BMHlpvX<2!+!L$_EU+zuifr!A&@jhjl(R0jgvC;VkOD5kl}0t3^f>FUBW4v@vDA9
zaw9F2ch8?a8}7aPt}r?=YR0~V=Tan`q5{AiB8nT-x|HxJsxZqku#rYE>=HDYm_*Tw
znXn919wR4jL`mrqy+{@ox?#um9TsA1ko+GT8ws`T+CF4sJeVf=ofSNsEQbFEz-5Xs
zmCUJXL6nnNCmc7XP#pU!k7yg<qx}+7ZDnOCjE#?)=Oizv0_gxcu^Hp-yB5O!16PHy
zkx{mT=b~m*BZ#`QoeIBIUg_9F$2i`*>9aT;_V3*r=BH=be<V~`r-%x&`jn;2)uCoL
z))%4sO5{?pIJ^w<7zPTaEPw*<<W!iO8+&o9oB8%1`{5t|`);NCZ+`Xn{yPE#mp&S$
z;vMZ~uAXgm(d<<K%!NH&EJk?H0t`7H^qMz{vVn|}{L0l3-jf6j08?7r44d?bnZM~!
zI5tN|%{oP?jJ+!PHCaF!()Qmd(5$^E9WFq812);r>^$0B$Z1LllFyrftwL&3gwxJ<
z8s9wxm?zVn{`l_85MUnP@qSFUR=M*pH3=sYlA%C2TL1(oCs7IkPEkhs;RuR%GnMR-
zmt0ctQ9gA(Tb!Ky&Q7WDZH-Rn*^Op>T2xdk2UPD2<#$b=OPYz^%}<6@#y=|%!C+T0
zL?a{Pp-C--$;4xjTNs$~W+hxWcOfh-Erdfe2W`eL@M+PINjg^RYe-3A;1*6_3?IM!
zQ{l^B{_>C;Q#)Q4)z7dW3O_Yjrc}b)F%O2MG9J+6<fv0(F?M6T--)+StJbX$b75&E
zjE@$>vu?RHOdw9EW08TZF`KSb8etN^83iO%Sh#!lP6H6789sC3I0k(t6i3PJu+2QP
z@hN_9#|^+$0k7HF8OtTfd(!?WOowu_i9;@2ycj<9$xrb8a@f0TN0^(TroA*m5<JiT
zQeOfsdwg`<gfHv$X#jVO=eUs?CfKJIecq*9JT*<LD+b^Kp}@JhDcjD(<k<7?y7TB?
z{?NDmtFL^`|9<r+1}~zYU-@|Z)urctI@4`GSMvP^FeZm1B81upaIWyq@_@Iiq{$QU
zQI>Zlk3;3<V=bgh$ip1|wyVSFp)Fzcj?J*M!1&3-Be!BZC4kY7l*{4#7g^`#+De!q
z{Fvb!ap@8EjO~TFRz_iyO%{tUde7l+ebJZR`?`nKAk{wb`y5_J0xHY_$cA-VU>*~J
zPMaXB39RVS)8!!2Mt$PT{_SH<-gr!I(5LZAhX8Yhoc!+5H!9xpA}rJm?N)au-oybK
zCGG2UG83rTQpJ@iP#sKxcB~bbkueolonke&%2YN#^Vdt7h_-TtRy{j?zS3>Yb+XxO
zy39lyjjHt?uS?93*}El)Sx33N!R(q@DYH}MN7mQYoGL7%O^kkYd?sw&w%Zv?gicyD
ze#<Oe)%V99e9#Q;JYlC(P>-jlXDrM$H#<vMYt?ASNAEuxj<Vb>x85AqmKF);cn=NH
zWR3r1o|29QtuUBM@*V^hdf#B5>h&^4a}s0U3>)m*;>8Q$+0TDr*s^7-vCkG47sJOt
z_K|SkefNahZhLOnP9CTV6Yca#X6y6wVH)GEklaHL9c3nd3tkQ06b5$@BEk5?SZH#L
zO~6cJsoYVX8M=;N+mjb2Pp1xeoIZUj+>Q65{ok^69xrJ;j89I4F$^y`A7j2j9xy8Q
z_{6m3f+hi>ae*M?e1ZL*n3)MP<fzokWex9Y0Z;6}uARuh)We=#Teg1m&IkVd+yB{j
zUG;~5`PxfF(kSZtFukvy>#{N4c~kpqeP#eid4DG7qP!P;ZwuL}IA38kQI<IlH5$8c
z&k2wt7p8C87uIh7SlCq}Tu7YFJXX6N{AP*bG31#>!UC*7d9HbYxm$!a2*|S$)(;h(
zb<dUox$&|8b^J$v<`pxy&AmLktN3-TOm+|FciTp%!es`d(5TSyx=k|-hd|p;>PXCk
zT6*7qRSTUe+pqDr#&;X6U^SoVoIq&uy^Ut){e&|gnt0hyJ-U^%SBQ;xGVbOOV4fsr
zrFH*bPULdkL)=j}!S6i~+KmHPkzI{;c5l0z+p|_{&2l4IJwsMq7uj#CtmX`QCwAj8
z$+Xxzf9Nx>+wOE|Lm1j)lK*I90I&!Li_D-^_^3LSItl9K<pneD>L<UtvK;E<EU3J<
z^{6QF%(>H0iZ6sihYmp(mPb$xU-7c9Hdc~iR5BLQT9T)+1X$dD=V!vHvu8{ku|(y*
zX8sptNXOYO+pO{XrNt$}D!K5XzkR;}&IqA2>t(|7I+6mjWb5go+79-sfXA@1vcl|s
zj@p?GL>EipWc{Qitna<={&4N}H<Hj-M|FB9f9m$|o4@(%=2@r!aQDs~X0X=qp6<E(
zzOa4U*02i$zCs@7J@0y#c`8Nru~-@n3yTZ3Uoyz&&Rz)D-}nqSc>+HSs>WI1BCn$i
zQQ@N>{U{(shsZrU!xret1p-zj)@3Tn>|4GtMjw^&P+6uxaaKT6#`D@~A+j9XdGOdV
zY-29$-+z$vfNBAGlh537N7%k~YuEx<%}xV^&B}q?^!TT~=1acm<!^cKpZ98OYPs-9
zHA?evm&^9c0gyA=#50oQzJRd{3Yo^3QRL(tD5nyeib-qi)LLjQtb}bhUls28)4#Ji
znwUSbyqN}1&W~hH^eZU?7aL{dVm3n_Qb0y*GCb8RaX{7OWD(Pp-r1=$7mwU{({tY$
z4m4)EZNj2#Q<sd;!kT?qRj95XFAi0|Ym=vOG_zyT#7L<sp75eG6Z$$`^(a<9p9}X}
zx{_r9M%Ui^|D0zxk7YZ}ySmNRy};{T-E8qwQ(y7#KX0wKCy80O;&)>RFpu*oY~T4u
z`v_vmTHQjo!@We9u7g~AWV6=ThWmlws^a3hbu?>~u@Kc%Q7+9#{UIana1!sqn5b6B
zC?&1+TkA{0`ntN=30>ID49+w<81K?#Sg+IwEtSIw!b5L)%bUV9Gwv7M_B@IqFN8bp
z__XCm&XC;JEO>^{&8gETk#N}zpZe52NVlv59If!8FM4^9T3?<VN`slaz)JaybFir{
zknEP{bA|%T^^NyqxQmtyx5{t>4TL8FE*<XD`~S|2i2y~0O-46Q?!ac*9I*Ag2LQ&#
zM+j~03OfOdRe(mXrR-Z^UmNV_J{W0gqV<uFek}a#&;A^q#Yxtg;aCdcJ@0<E<p|`3
zHSkR2k(4ls3iVx}7UykmeJd@2=@AKNDL<o!*lh}99l82CTVBt$ygaqdW&ahbJB@dC
z54k4ggC?xLr)(iY$@91B=zz(C!ShlBvQ>xxV}$t5oj(W2RPnU78JKoh*Ij_#%<LAH
zZHK$=xeq|Uh-$%wFn{;}#qwfrb;5%W-GAeFe)ROM&wj~=TAPi(-MUnJSF<$sj1Hil
zC6uLqMn%SmW8bM&k-R=(Ovb29>tBc(7JSmaYj%>FEEi#t6_?O}yYEDpx%o&4+X1wT
zgcLKQcD`)dVvGet0oE^?Greb6M-@OEpWaH47JyNQ8hJ^IdhuOJ68<aeVfEOVnXT8&
zTAN|1KU=s>S2@?G$6_P67;GfPwsd_vV46p(?-_e}Wx5Nrq+w9+KIRtcW_GjWZuas;
zT_z+#TD+#P^6p=5FgZKV$lRar)<1wZ{Z}k=&+OO!{69@n;c>PhSJLK&0P`rHt@6iy
zo32BxgV@LG^7-6#1eT8gBG<M%xxL(d^OZ((G~3WjUpK67Zvlwk@B#zPE-f0No6y1>
zszcxt&EPBy9-CIh*<`vowTy*De4#k&lcKM$qo8KMeBMDCq?4JVxfS6rsJ8EY*L%Wi
zfB%1l<)w=-sOG{a$%m|CxK*<vgWkj_S=`+qO(B;~I8AOv3HFJ}X*0A{jJ3i}6ggU6
zrA$@$i4!LQnR6Id68t2&3jmM-1qNPvV-7HGxb%67?A2)bC@^W&Ibd~!>^fywAfSQS
zB%Gxr{_&Hi$YG4)5fOqT<Tf!qWuYGBfKHw|6@K9tejyw`ag1e32AorP0p}?8v>qx_
z{>2E&gQ~Kx3Uw)Lr+iNphFFJVQ4bdly))hf9ORj#C?L@1<SHp&0Z0^Hd*A!s7mgi!
zh;ZR3$9;@^%mo9=6Qd_Rd%%7xk96Pt_u9H1IQF2OwoLX!n4BO)CZYheZtcfky!owo
zi>0s%2)YsVnLe!xl%}xyq9z22Mzj6G)<ofj=kg;z{m^nbTrZ#|cqDH!#}Cx*xWzTr
zVE@Ic%a;_2(>1^ZhPaZv5bO2Hwsj8F@p1w)yK&!1dOSWej2zh&);P~o6jE~14Y=C4
z5ehph1gHOWo=P0&I^&?iAxI(sp^7uTkdp+UDGN+f;eX-o<6*~_148nw2*p}es9}Ui
zblM+(t%qN+?xGk7>0JbrI3%|J_N{(r*HR4S@qsI{$tm)E&~i?H=s?>G`?&k|0kvP2
z8!7xJ?!K_`j{j?cQCQ4oJNM(|ooF>%N4e}DXm)e=1Cr-=e%&uTs?_pF+2lN_S2qNh
zf5Pe6{P=4yQ=Ko)<wm}$(a1fY3h1i{X>C<c4jE@rTouwP83Tu#+iDNo{`cGuyIFNh
zX?B*X4zyEV+~#3}=&eMT#cw(>km8M*mb4Y?;b9QHwQlVeV&LW_gwP-qvd-+XL%-hl
zzT@rT)&JvH%(z~4a6buYbrk7Z-H5ywy_V6FCt*1eg+}B)Njr?=8=siPa8H^sB`adF
zc!7pwU(TF8Z{Tv|s>7CxC;>2HgeiYv4ZxHKhy&puDk6c51kdD=r7(%Dn^?Y>YSwQ<
zGgxnhEC8ru_#4ckcM$H<nE%<I{aLu<GoPXEW;;~jtIYe-ev3UN0J0In$a=U)M#{(9
z5>Hk$qV0GWN&yOzG&V*fjE_731D7%$3JV-Nek|N^*ByZ9xYgRo!_hJ7ShcORJ#F7?
zq4dBo=lX4z;}yMH3^#?xl$Q~CTdc3Eui8eDDjtM|*)*orFvhw^2y2S`p!1<t%w4>l
zB6U02Iz|*TDD_^QB9qu>dj-Nxs@KU*C@ks(WeaJV7b&mEuO=k+QD82FlXnA(WH6?$
zB@gq#V_~nZT`}7j59`yYMmDjBjs{3NXdS|<O@KK+3K%Kb&;Dl+ywp!~B>oF@*4%sm
z)#VFlLC$HXn?|1&(D<atuT^vT;=Pc35{Fow@95Q7?mu4Uw(kjJtbe41*7{zJt|h;>
zbU%1hKyo>f&GC?2{BkQB*h-%AI^u~IZWUEsVL*6sVd-uE8Ly{xcC^rZ{qoA@ukZY(
zSFgoA82<SOMqmgqU%(~O_{?t;f^YpZjL%orn)zFro$L{Aujvf603(z$jBqpCU1b20
zV#vB7wGnwJE*xl$uG05$Ht0wa!NI=waF>Ba`pWEZ3MZsoi)Tgs{u{XYWJuKO-8y*i
z<i<f;H!RoDp5mrEcGqXZ|N6~e3(vpxrf~Ju*VxTztRi(plrhweYAqP`p>NZQ?Cg~U
zfm>nAw*7=&L;zrhEcTefKzzQqw91VBI92$QVLO!K2>=Eemk2UWm9;^M#>7B3&07<w
z*o=1ISamGcxL*Ish}Z)5&t|;J&j6mvm%Q^`?`GTM;YBZaVb~4mC~sn*Ca|?=3&*Hg
zyidd#rv%_Wx;i)3<~TCul%SuVog)O;1gv0u$&+DQo13fQ{-Y0=<jwBgyP+oUAxt(3
zI8)ohxCopr94LcqMl>3`6zkL&E7TS#-!kGl30Z-<`rS97Nu$Z8Qk+$4RJKAMFo5AV
z&gSxw*tUxL!72oab<P7vl;haciBiW_U_C8p(Os@h)wNju&Mp%#$JBZxEu(W%%CgVr
zs3|(X8n$0|Ff5F|*9s>`aw7nXq<S<s9<I_KjT>AtzK6kw$2Gc}s(#LMM(V+UtQ`-}
zQf@q~oLLU#bLEhm5&O^O>h0|6I=A~m$D;9y*KzOu@XrOBI$MTsc7FW)r^V%C`NiSa
zgWogmCFvL)pdv`|+O8Xm7WjI4vpxxgg0O`StVm6p09_JdJGkd}aKGO|MD-`m|L^bm
z4?Di?wZCb%!tn7qM_>prpT`AK`^c-gkGlW4(hjdIWimU8xmuW{Yn2kmN=W*YQE_Ea
zKDi(V!Nt}eK_<B`erKg`;w(B%%k;4fyxso(wD4ClP`;eyMdSBo9xe2sO}Pi)H;uw<
z0tMYz%-MLocCj87>zm=%f9E%e|4);1d9GJ^r>)80_-cIr?5}Uzz($6bd0m6)^MQSP
z@NSUYS3QS39L7P+GJ3X{%>H(g-BW;z0}R@~7)}E*+g1cdvs&@9c${&>{B!#(8jJXa
z0Ur<V9_Yr#j41~6!K3$+v|kK|4<8H%4;;X|@*Ive&N8bggCnD+eHLND!%A+bBA}@U
z3bR;kSH@hzoATnOW~S`Dia%Afsr<}6ccHe=XG+44Lr@T~jbda4E&>WWR`Znf^f6UW
z5Cc>tw9R1p_db2_v!xuoXum}md}8*gerJj>poJCrjhJQvL5RJ0FnB~W<ct<!wlO4f
z#GK!&7Row5F8QK!#azg)rx+k}!zyo1={W!<OLhLr{l~+Wo39R;9aEut+Iz_eG(bE9
zK<$9rd)C7Gi|l8G+>#k>XyZAKKPwdgc?v8zN-`+FiMqpuwXpNc4)Q~KhEmCC_m6o|
z$ve^ecxUv0CS4plH~KPNnDJt;3)JBx0Zb1_?1t!FaPeZ+1W*^0az7r1O}AOPzWsB;
zc=3MH>jIOE?grHQ&5ilx%EbTu>0kZEH$L-6-|z+XB=WQW^m+^d<{voUYw!DouPN6$
zuPxRt%#VgL*B-SQ0szgT)SJ-kuFuG2l-y8=8rt_{%r?jEL=zTgS!w2EcWb;)VnWqC
zho0H@nWoM5k}-_I4y`=ipSB6z{1zM2EYs^Ebb~0Y(rQ5yK0h5k@X@mv#@oY>e*bq-
zM?zP_y5~W95jI<q1Dk`56AlzcvGp^fZ4lm37_CJTUo0i}c7Pt<p95eN0@(@;SUHe!
zmQ&cr!fUo`+Ghb$%o#-hOJL%mGdGYncCrU<sVzh!U&KK)UA}6J<;<jWXHHX}kqI|l
ze@&PiC(LK|BYAttc=_=}iQwoXdWcfzO>1^9PTej&nLiI}O?59~(~0z;$BC0C!r3!t
zpgK>G=YcPUTEHlPrrtL4BGP<92F-{Yc8WEdj-gFEzlNSRn|8w&C35V#a*yUBlxdM8
ziX`%F80-r3bu}^48{}NdqkP5?6bvxkOY}rOu%L5~2Qo%alD3Ed?hZJpY7gQyB(H27
z+h4!`B<b-R!{jx4!rJjuVX8pW>3BVw$+jQ5LD$ws(1qKi=~%0Y+O<3wND?R)a8B^*
zEtk~|lkD7ukKP-$Jcs@?Bjmlr#?u(ZdnxXUohgC1CJpLB>erPf6Yf#VE3&`+AJ{9q
zYYecXrxNc8fv``y;@_mNW?@nHAhk~2*B*1^-qm}BYi-vR8rFM|S+!nZY-4QKe0C%+
zowZ*{yD|Lx$RjWWn9p%#oPYaI|45}?|JC}1yTkG-T@|YsY7+E{&s+Rh|K!@+j81o*
z*B7WMu3q$_!RDU(?QAzjyg@xK>^Go3vv-<h+bcKl9;5F6F3!_DMC9mZzt<{dPz+n*
zwrs{O5X87-xOpfQ3>Pzd!>2#|AhXgO3GPZbS%KoKu#9caK_+=m9)8lcwJq1N()TLf
ztC2g{cj$U+nQL`3%0?-a1FiWy3@H^}HXsU=VPF+ytck<tF~0I1l9Y)VEjzXdn(2nt
zahp|%;GmZfRLx6^21om2#v>XRA6CMDbiv%ab6eQ8eG7S}S}*dE@(vMk;uhVASm;gL
zjt04lkx`ObN&U@oE!_O<+pPXZbt@WOg#>FD{0AR=z$nl2cs5kGGCmk543T76#MqM;
zV`n2BqvNVS;;ci4&-&b?eTc^;uTu2x&YoT23^g{hRQ8Vo)~YY@5LdrOB(J>H4B49^
z88>OX*j6<^aX%)9@=P6KfC@dS9`UiY>r-BY2b3U@8L?zKI~q=rhuI7JYzlx`e8&Uv
z+K@LaZ8Hxq`FOhrlQWUaPN;2ABei8m=n@`N4ug_@qe^E0u2OeAT=?Xf@c#e#?oi(a
zcodpU)Kt8SYCybG>}+#>{r>XvYq>C;BLk4=o$GptUV5C@_z~M~B|!1;X}Z2*3^`7g
z`hb&eE^Rl@&U$EA*SNO9b*r&eoe#%-aL0JKYVQoWM-~_v`yw{>EAea&|9qAa7y`^^
z8JN?5`ES0u({B9g`J=aoPv5r^&aRQz?m{Q!Le|`e+7vTL3U^qEfX((DU?gd7hFLd6
zycum46}P3|*(}y(vI#QWkUJ>owHr^15=VfsS-0a!G?4zC*^6kitd&%6o!rP~td)l$
zTZAE?+9jO6xEdx#CPSUd_Dl&@l?n%K9dTx#R>o&m^&My57JIkw4n$89-=nId!DCVW
zr9vgWYFjQ+*r=*@9ffB0<5cHQKrNP3j6^F%VO7|~tH~qQ_FngWjWkL2V-b@oj-Sma
zJ-p<B9tS_)<qZ2%2`_lgtzp-;d7s7GY`qV_$s4k9Pr#mXFt$SiF%P}z*h-9zRFU1I
z;mqQtw99SkgI27)Ko>(itvVH!*)liHn827*)jSDx2W&U^>93tr0~Nm*3|Qi>*wte1
z1#oS=L_7~D-zr2Dfxr~RBCS{LcMU#Rzf_V(x08l0tpw5`2DF~!w^6Y$S&c&4%%o|e
z*^Jy`aN3^6)n_NRPa~)(=aG$RFRq2TtM-I*lkg8YUbS1c5sz+}YsL9wM4tNy;k625
zGP!eC$e>N8o-%FneG090)q0uEm!<NyaQ7X{;i1(|*cfXw;Sn>fM62bL8;MhmIN36f
z)?P=5u|acCw0p<K*7sX9Z@)P_(SON!$NNkZ2fto?9ocx@#H-Nu(<e3pH(mC+(gj`Q
zjR5<t<ke=jO+aI(T|p*$%g6uUuO4~!zyIqY#n~_AM|=zc<`IVF<Qsn+eZbM*UU=}X
zaObf`INRPEx`aqFNRW#r8Y2u27?!i>nYv{1+^9suZ}+)<kbuEw=JA%`i<rQ+W#YON
zBQ}&uH;#Q}z+~ICX>s2(vh~}Y7H>vlK`}#Qxt8(?rfuAnN@bY2un?L^r6`0oqL?`r
z-?7VYJXsF+Ts&-Ks)@EqeFtc<zY};wO5$(ZzCD~p)?{X84)0}|iu8*{v7HzjC#*D!
zah|XlzCuemjh$*Fbc~`Z+xlV{%!bE3nFPcMIP{79MKd)CkIg6xY~lwxbb#UDf&In;
z(=nyi!13Txv7Uva5^yEY!3~vfUF)#`D2%AmD6G1j+o?~P!4PVksNN^!bb(r-yYITw
z>T9+EX4~dx0GJu0A*=Y(z*J$!cusqlnVll{TFgT=)?7SyW*~ibEf1#4xez07_x5cl
z0zgAHmX(KI+XByMAoaJn4d9!mzG<UQf;t*d^K!_4us@<ITMA0-J*6MwL_ro5At4KM
z>ijAXQ>3Hl@_nbmmgim@a=XaG+}FShMII-5*C~`Kz;RL`d6-TK`TiQ=ExcI(rbGRS
z*kocGO3!nY{T*wOqv~!oam>afu_h{oRUFCmCK53`H0jr&hR6Qrpy0QU-vm9-_D}s@
z5{oQ85_+fie&aZ>7@j+*o5q?v7Mrl>K2gn>!e#!wtzFlW?lUG5mG&CIk8}^a1*K#^
zEv5If$bG$YJACo55f}o@BU}{uV)w@@<;9&RmqMtFTw@X*S!Q-jUChAf;U#T)5ieFV
z(0(0jo@B6)LSXSWNOyou7|mOV7xAD~H5(p3=*`4@mS)S_g&&os>~8VAhTRNu@gs2A
z^15+U%VLIHjDL)ls>S#04JS{YM3)OG6IEZw#J=6k#<=kf*-SX)2_gVlSc>i9z!+F;
zkld9V%mWWRV3qi*E2|V4mf41k@|4x6$O90IET^h`12dm}+B`eXJU#IpakFP2{Kii+
zZ3h_ttRG$MhfT~d8wtUI*XlLwoG7>A>y~$r43Xin5eD1~HA7<_sg_}eO?-ZYaMve4
zb9>nM_$R~sj@_mqrb12C+N?5TSE1)#!dP3k%!L^|8P&TuxF{dzjJyDQK!m@CJrc+`
zp7=h-SdRgYQIOj<v)8|Ilm+BsT4*eVs8V<_7U1SH0KitX(@vwt5NoJhI8%%?JQx9%
z-l>Ihyq0Vn^g3RvW~w<qBk09d?>j>ifM<mn!ekqF9}UywVeHiVm1gJ8J#ih38k9j0
zXjSlvH^bP}EOcciG@|+vN-bDnBwiL9o}+E{R-^~oTd6CuHucKsNnOSzfpm;UdWh-A
z<S}`U)5ml0zOud1Z}{sTUfjbTMkIK0K=JF^kHMk?HWBk{N?}pEJ~ZwYrgKkOc^VRM
zR=d+Q&kxX~;Tmlz4)bORFb}`Nd!Gye<`D*`n;-vyrLzm6QrI6Rx9$()B8I7kKvl_H
z$r63^*>()`=*Blg=Cf+mkR>DLMlpFSQPd@cdj@W7{lJW=8JuKTQ(h$*d<U*HjZgDB
z`!wE|9;f!3u9pKcz#^2zek-*1jBBnBs|yQw8O$h^lr><{Z?vBNo;$ec$xWt2b)3Pr
znducUyys9${*_n%1_hfnJeX-_@1v&uwGI<$_wJhwT%=|^Nz*We;fz(K_!!$z_{!7K
zahvf^4@l#A=>d#^tslBA6UWHIC&?obEW|KV+;dw#9#TIfnO5#&-VbHEF%m(+&N9X^
z!^GmPfBwesp1=E8c;1U%Vrt4NBwQueBNc#aufED8XSU$sjHym0vd0p0Edh*qM`>1U
zYxi@VX45ig$>3`Otq__1O-&(?$$l7FQJ>qjqiamZs`KLDB!*aN$1FN?NZcrQV1x=M
zf{0y5TQ&2%MCBIET1wS%=xn=w*|}#qiWL932hM~pc_z!l6z>a?7}A8oy=I*Nc|5)^
zMk8h$MFA6&jWN9a9D<{50SwoQvHW0~sn5<VbvxS#xwVU9VTD{qrU)%qIT|a#)q2z*
zEzy$Y(WsUtUMcQo8#Ktf^NC8jZX=M#F}4G;0X6uv5%&bXUorl04>H|vk|Ht?k}9~o
z|2Pp)XH{v><y#ws#_Hs$<N{X3@Ow5ters=a!xs-5fg!+LHY5-H$q&wTs`Z2Q)_5pP
z?IM=WUuIbXm@XkQzrd5It6Q#AH`2RwV@$ahT8;)T8OC@o_km!*rBCKv20f+g2MuEi
z?WE=G_K1dez+viH(6qb}5X_Qf!oj`ULiwiaFo-a`U_T`abU&^gZ(2XPUP#HMXt+^9
z&?;mHQD={$p>^F2H!y?WVwzlc-*XpiwR$+be;<m%gp8o=7A4g$*%yVy<iQA9%sBXK
z1J}6oX(-(2IWT)d*A0Rhw*jNzpZ8#<m#|N#wKpd5r+tu0d5{>QitEvzPFL(RJ}*co
zOe)0PFu^gXrbcY80_>+xeB|zM)3sNHgS%;>Mj@w^my1?lDF&H)erY^=Mlb?OoJ06F
zV*baNQE_r0BW^Qjh3a(dR`cp(&^!MiiwvKRT+UC4h||a}!g!KB-O#~O?#mx^Q~0?H
z`PbVd`Z4Soq$R|FbSFUxe)iRJ<7@-NJw~`Id#*w~%vzYfZg)7og|HyCHY1|oN0}D$
zBCIc5^mqkK0L7ZRlS+{SfRQ#CYzUKv$-^R3oy|C*risl?n1qZ_k@w90s#ZsZxbnC(
zb{1k(C{Lgk*^Q}|Dcx_v#}rP}Q6$EkgPtGP;A3Fk6us8?tvuJ>*z0^mmRe$$u{IG6
z)E_9vx$m?WI36)4b$v`c?~;uS-#lyth5&QfkZ^rIL-~mg#bG03RDjciWRyg>Dw4Za
z<Dz>oCE)$XCiCKVzr8ob`$1#ibFQ9;V!4@Q>9pJ<T$m0pzMQ8(F1=yWPZBsJFDdRw
z+?9u~S^F`;4fEm`zrd)&=2gUyRnLIO?e@2!aGskZ<#nuRj~IKC*P9p=n>%u2)=@cr
z2xir;Ewka^{=L+O5P~!BMPiSdmByMB-Ts<^Y8-4pA9)Pim?~*&qud)M-Q0sAGTXd8
z6$0uHW*?Y+?0mNEux+vS_`{wT6LTZ(9XV=CKTR7O8$(_nDVQp1$V*7+NbvHe>#ry8
zLTC(t5sAUXF-?n0=S@^$2T?zd@qfa{xQjhjloOv~ZbpYJtEU;X!db~Xb0F-q=;C_J
zK*}Q|+O%yN3N87MviItdf!39$T$DOairEKBM1o2D>kJhG79T`pg_N|X*XI)TFgw5U
zd134j6OxavLPusSq61@xkO&mf^DUBN>!58{BSbn{M3a)Fct%aj7%Nk8#<MDR@?oqD
zSk-950O08K%Of<!2o-4RuYq*`dU0@x_ksW?dfQH{@d=8R)by_7fomXM8~uwXP6N}u
zDj-Q-Z2W5~3wVu*5}o5Zm&cd^cv}FYtCWjTm*3al((u(Ijld9KE*q7R(&$%}X^)#3
zoexE3->RG*h5jkY74MzlRw=pXk2I)m7%!VLyP1hk|J5Kk^Z~?wYayxln@5J0M_NbE
za`nS67S0&F;rxx)prqsQ{TiI**)jr7bGss$AX;RFZr~c3Y<>fokP1zwN&v`7L^D@N
z^xu5l)f8E7rOKaK?I^{9c@p1A1TocMJd!aO+<`dT?-L;U2hsP+`eSg({>FVzW9);i
z4;_a3@BJL~le!*MG%*@W^b&f)gV`!Y<HdC|{2gZCcYfy6)>LaN%r1o)r2ws_WgZ4p
zm3^FDYjR-L&^PIm3cGr2ynDwH_o45|YM-2<Yh5V^yq8pDDj*OKO9}h7Xwd?OZ8M=g
zI~h95gupCW?)fqWnB*m>RYI6%)1K(C{SZyqLSr^7^6O5F@Co$8)*d(;!dG>}^bLE$
z>PJt9`Rt74v)qH1=i=V2U+Dq`JP{;fYRgphL+7Q0AJ!es+!!kxe)frgY?!LZ`$fx+
z@MDXOiD{N(bp$Mm#Mxg9muU<YT2=nV0Xum!9@=#x#U<W%|8kGW9`8Jj{binDs^4<5
zg@cWGb49O_5hY)BPsS|F;O)bD>q7gsQ%Z&ZC;#g|&)@Qse{o6sgu%@}-Cqv@=CUgw
zmmB?BBoB!p&xXRdOPmy}vRlb;pB4H|Yg4LdI4)g@+2mx?dVenkm;Nu(l4gPiR@C=C
zdWQaz%{U-j3R2N__AS+=zliaB|2if_wXcy0=Lh3INy*Z#W^R^P)?F8S%ji8f{p1A6
zXk<w4M3(>Xp~K<6d+#<wqFKL+Fk$p!V2Hyje`N+E&PrLzLmFvJWUMY7!^HH`@oBc0
z9`3n3JgjA(_7{tbrw@GPo=1B47!3lJ(I=F{&={~niF00c<Vg67x4bRvJ8&eNf*f%_
zZ2^uP*b4>Q>DOv1R^lm7&Bju++~^rTVv^s$+yTZtPp`M|W9a|wc`vq1fB85979z_U
z5V^EZ0pr(3ouU08vN0CU)Vk!qVAui54b|1?$Q15X_$iHrTj)4u{AJ%Ky2Wt*!SkU-
zt&{*}X_E1)QerIkjJhf<uWP_TEE;%1VdKQSR?&qE`}T(}lZ#A2AfbJc&=9Hs0Kybr
z!02K*%yhPgHJq57_1OW80O+K=pb3fRvlKn(Yu9<AzYi$x#VF_GW7@k;;_JbXvT<TA
z8(+IFtk6>6;(txg%tDlGSD{ELQ=66_bwq7csY3-;oedtr!_F@Lp1kzx9llE=Fa(%^
zYvJ4*esls^?`vBEliVm5k@~^=?QA!cg<rh>!W`{nATFQ!+qZtPM|078??xm}s2}z_
z-RNm^(SM0I#pNJlRo=E94;Pu?HNY9XxGy!Zm(TodW^6a8%}0B$?O5ZE>N_iPlMzuk
z#TT<1Zk+lS%`oq}=WYrT%i-kd)3z?JBJq%qQ-)KH#EbO2hFUw{HwwM|j{B6xHXTiW
zwjVL@5kU0v<Lm!6hBUR`ao@}^*{rp1(0%B&U-29(97uBaz@FWqO7EHXzwbTaBIJk~
zYRf6SCnM;25Y2R*ykNR-QXH`J;k8AB0NU$yV%LI=l>gP(#?SPrePy$1zmk$yrDbg-
zWej1LwRq}WR%K*o#z@Fx3w3NNezm#~KwnlFrE298WV90_OzTcy?~^7UlcM^=EG&oS
z+2t_0e?GK#(UtT5S{Td2CW{0CzYh{NY0^OmjIdXSu~N9S1i&P^a-Olw1B9AvjNs)I
z7tmQ-q`4a;28CTsWYXyR9(i+-N*jQ5F<JD$JtilI^dT`0TyJ0ccb;C^ZN?%g>>LL+
z-fD3h#)k7rmxahKxsB{5H1D>>B&%Z#L6K(E9=yAkkB0y=I9>y<hX6A;Fzx0ZRjP@b
zpF{p)M6Ns2aHCHu2ZIEOiprt@%jcLer6{}KD7{BAX32nmmXqQirvRB>>W*{qvbQ%L
z&ag-Nn9ozDBI1pI{fMCBAeewK?aJUR(6++<i#4aXL09G87A=!>223(Mm1jKj#_*YY
z?zf&Vu_(d}m_kb#deEr$rAYY9>W8etQor}u`Q_X4ef68%4R3$P2f<@7(v=3_5%6YU
z-2J6?zqdwy@Gx&iMkE?^+a8u$l;xeU#IC#QNO<e}{?3XholYyM9KwH+z_H_X4@PqJ
z+JBpU%d)vdg51R3@eFR)K`0)Djy>+Rc`y2f&8*X3H~tA=yyToPp2B1T62fz%J7z+w
zcsyiK%QiBxm{}Na6=up4m9nxxqJwZ}&xtf-(<CBJEc8qEce0Ll-CZZb=#6-5*X|0d
z_dXQJM0r?^JX}unT^(Np0fmr7&(5Q&QYSZ8nxpZX68(TmApN8J#zJ2h>2bBVJ`bh&
zz;tLbewL@R9E`@&z1{TQ`vFCQh=$beN6+V_>qVh5yIvn@(iX3)hfO9luyK-+LS#ls
zPU9Yv|7vHg2~`JxruClXVlx@SlLF0bcT4|58@{@H1cm@JFe1o!&o?EjLTp%!paSlF
z7Iurrff*`h;^v;mYWvT2LHFpvwm>xb(O|^evA5OFHtX&~MIWrvkJ38&Y43+$*j}fd
z%y0T$#(OZl<i1Nfj)#Za{Clr?P&Qdxl+o1~0T392mww4h!k@nWP1q3c^-?9tt71bv
zv#8RJ0z$^F!gzRiBN|=Xzm)gsnbk`U<Z`dA#|Zl@X!1xOUxM$K0ah~fmyD}yPBcg=
zY%~vmS^1Z|<cq_*Xo^+<EIY`e=y;|kC&|x|5FZhU66T3U(r`rv(CRtF(n_;!KXdN=
z3_ki-g*!6-&k-j6O&*MSF0s-)?w`HqbpcOi^1$}6I>y<j&7&J#r{c=EYrE=FssdT7
zb+W|1hq%%sCCfE#<0Rx)?mjI+wlIDD-mvhNyTUG=cj(Bf&oS<_c|D#RF^`ly%m^S;
zML%t7`_2GW5mp=LULiyB8ES+kkREGf!`j)^FxT8m&tm|Edo6{<QnKH`D1uENRN~l1
zfJxWx<qXJ2y00z?g+`$6h0P+^xOSO`%~F0QM?OleD5RZ++8pN-hF(`<o;If<ZZ>>;
zwh{PTHtK0NYDlbaQ}{x&Fz@^)-C)%ocrZX=2TMgn<E_80nRNNnwT$c;leMO=q}QTB
zvV5B_^)$jzWD%JyiO)gpr8H~p&0f<iHqNlp&0sUz_*BtYdfofIjFbUgG+>vV;WYcZ
zlwONr675z-(*9~D>6GIZRok~d`<8J1b=Q*Uhn{M&>~0u|bX~Pyb~>^HNt#^tFS_WR
z8IJjDGXlN%xeS=c#(M)3QRMP583dQX?=%knbg)Qc(wn(zJfeYCVP%`w_SEMJ$M96Y
zmall(mxV*91k~#d?{m*?Y7AryEzc5j0M1VHeU|5ERASXt$Uw24bS!o)sP9ZnAotFp
zvBz|sByY<9dViY8crx%2P(?3CI1#VcEKIT%g<DC|&LoKp>|l~%tfi^uGBGi17d=Xw
zJD3NomFYEk62fHllc+u{ZiLBwyHE<CwuM?6c|<Pt6DL20m|CBE#G~lbH8$v%!?GD9
zV?bW(bSWNz)~{9qqlBUJbkNK!Abc5*)y68F$aJwpkbNYNc9342#xISl-3#%ayX5|e
zpQUR=*Tvw|t`EB&{F(X!S>2BjHsrloWvI;^nQ)Mu8a{>q^LbnY`9fjZ!w(+HF;KA?
zno~M2pL!-kZ1+I2pm8?mWrS{E?b)N~x`0V~@7#Z<mHN|N|I{kD_q(2<_SvF)1O2}}
zN%~%Fx<4)L0fu=DgY_B7_zz5B2adysObDnyF_shS1T<UKtat|Pt?&MhZ#C(9b?FqS
zh*74dP98mx#}t9WGY2;NHREkY&EY!D>J4iY3y(<aofkcrHwf)cRlW3?PO;B+{VA%j
z%&A6Q`4N+{*Hbfbtz+P6-c<xUt=k1-bO>z<DW}5szVh2_{}n}1v7+~K(Y|?OIiJkO
z^BvWcZ6DIpR`2<&1V~AfEjcQ_z35cLne-rFCS%zItR%xCfNGH^YEk3Tpod9&gwWDF
z75&;oT8}(%{z)%PZ3aX$Hz7{pR0IYKk;R0)kyzMgH51FBOIAC6A(XaHhp=Zhl-tNH
ziy$DtQ;4b;c4Rztu8r+QxQmXY6@*T+V*pH%&x{==Sr|Z>K`${$8HG)jIlU5!ri$c~
zH`6Y#ypfHB4~7vS($mj!e-h-^u+>-ToQn?Kn+Q3%BoR!UnKnp&PDFyZTqL9TB&z>3
zrkd^>jhVa`9Ys{{Mw@QO=V4l&wu=lyWCN!j1=vxE3FZ_QQ+%D7SdLyG85vBQq<X^E
zSSvowb6>zdv&-I$-oxGVz5(|Mn$Ok;V-x+~$SN4=GcGf5V-4cKWkHq{v+pg^n*m0)
zSUS#LoJqSeu)?&p%Rh>i&-PpQyZ(t~6%9a_u*cVa<(G#){*ylqchFP5z(A=5fbuR@
zVQ!rrV<$<wNgdz@ci-|cX-FR7R&)ki|BK$}Ma+wq@{(~KJd8B9?x77Na0lETKbvmL
zQc$uh)|`0?q5`+Vw|wK*hWCB&@55W({$3<<HjI`lh2tWjQ(ZIAp`2Q42HO2J1FbmJ
z=^C*KjJ}J^G56Sfziit*FHM(%54^L?Qg}?ta$Jj&1~O?GJS1I{0A6lxEEKm>(SOfI
z7_o|e<?6U%6`I00HtRJK)uJ);TPu{*wVXsweLv!fw(~UEBoA}X@i6(kLv*>^71r;#
zA3eJ<U(BRvv>)+Wj>j$8f2x8)4cVXxiFA_e51S$Yph2brPmuutuW>r7uAfD>j*weM
zRsZsgsjpHPQUH{dC#-0KDL_Q(vOGpnmD6>k^JZ~tzdi;-W$B8s^KXacSI*_3vh<nf
zyL{PH1LKl0B1b6SrIvNHE@)#DQrqcFem-qy`1j!>Fa(&vF^SVz@$M~VZC-+mg;x|C
zAK#Lp1|{~dr`x5OV3Y$%X_wxrAI`z`uX;u^Cbg4Qj2-{tEz_Ij_udceb3bqL@RREX
z^>W1MzqePH8TKT1(_33Kl$U^!jINA3|FjO(#QeK|`)|T4zwZY_nGn*xeTakdFHeG3
z$=f2inN-l3X72sHjmDK(b&nTx3EJpWfmJlm5mX$=<IzRWAf>4LgPX_#@t6m4XM-WN
z-XLgwY&(VUeEkkE){cO`Kl9W74n^R*5%|1lYRjV|699@cv$D!FnYW<4%^;A9D3O5P
zJzG6`c{1A>-+QlliJ)ZTYXfZu>rS16tv*+aX@zlg-AQ*%9#v+NNzqmk{kfBh?b>q2
zxUy^6CoSp?QveJ_!*wk<H_T~M_B!sz2*rgL?x!AxRuz*+c88Oh+rzE^XxLb$BG`B#
z*+fh0fhkrPp~Z#5rj&?GFza7u4-v4-04iB%%4h|I@eH6$h^(+w2?bOiI+hf3f`RW3
zBWWJJUh+PIb&n{Ly!uF_7!2_~>~83dVgG_po}pcVid@(|tQm$cV_;>JHc9!*>juCD
zaHfjkYS}EMKI6m3XBmMZzzhrw_a{y}7maS(c=M{jN^NG_%m6)}qG*xpMp+U%g`tzj
z5bw3mdBxczDbA|wu1?dhID52p4<z^P`*dTZzitK}o|u1xEnki?aS=O)Xsmi3*=5W2
zm^bOT61XIaujQ4R;rVpNeAV}UPx!CD@@h}=Q~BPaABYEDFf5t@D#T#1n|>C?jx7Gq
z15$QS{@3hWf7a=CFB;!yaDDQBDYRTZ0{t1Q=&;FNN5)n<UfT?YEqMwu__hrPQ~gih
zU3Fl0___c16XE~-;hzj|fBW0RjsN;zL0euGn}}nB{2+LdEli$LG=k|o4enby%-%6Z
zEJ^3py;47Z<<ryt7{Ex2%!)ykFQUgiOebB7sIY|Mma))4<BL4B8A1aB&HC{M1SoaR
z%P8v1SrbQ%*KtJal&rTC#j>NYDLd!ZLVamHjPIQfop~D6AsaIajb94Gy>`TwE%&k2
zeNmLej(P$?UoTCJB4L9;t~dzi_?<ZaV|X!>C<auHFNXZS*-%r3r$S`pZcL0xEJWdq
zx?URK`2OlM0dX%Z9IuI9$lA_X|2pwY$pL!s^0K>d;QDeVndiLBX_oh*>o?}Vlz;Ni
z{CY4T5(8xT_^cx^1ek%LX|=i++svvIxA$3}&z=(9)ZW|dDu&9^j3-({yQIya^bMn5
z(RPLV1%3&bBybrBWA&+0k8slBiqZ<>5|^{B(hn2qwNKsb&))o|i~mpf<x@GEHz&hp
zhSA~FmiFdcaS3BsF&^Xr8nsIJ;UD<EaN_8F;Z1LRQ~1_be4~XFO(I5y!}6oLAvHez
z{qTvi@&PjFgNK|z#$)jgxXDO7+<-XiZjiI$`<;L@!iphk#0x()X`9hN_QA@Hw|#9d
z9sHWL&G6MPdujO3|KZ<;-}&9&3*Y=r-w^if+R6gTLB-Q+7meP?2>RF!&OiD`nf4_b
zLyd25B9Zn!j(@!7Z2aT-%F5DOPRwOk^a}`2p@=3~c@&5vpFilunwTU%sS_p(W3z6w
zbq;$Ti*_{ux_do&h1u4j6rWuQ(>Lu21?2lHi!~DY<3?e><c#?ZV8^ZqfO0F<<&{u6
zIPSTaHMVPc63NGOXnI#51eXux)2pFGSJf)IWY%BFawtlodtTBCb@^Q)RP>8jeIFfr
zoTZQsC51z6j|`Y101jMZmxRh9Soc8Ou5}NY8OcKVHc_a%)P6O|!B~;0<+cVdx=Y4m
z_@>`;b~qM{K)aJUpP>@jD!AnibLr%$eNhg9sh211xt=;P29_>d4`b*$OGdFjTXJOa
z`>q!PGQ)PMv3EO`jEU~l0S_iwlz5gLP%b@XeWRUjdYfHcc3vO8%>*1hTXCt)@TOL_
zzi!nol}eAvch!JM5|T0a)nERl@S0!yweZ0Yeki=`%U)u27p_JU$pwo2JI>g{J&?X}
zuyJrC_U-ioNzn-RewV^uaXsd7q@5o`%rpf3@pRCLhSUJ32eoDZ6NAqHLI!MLuw|;G
zO8nj5`)y&KP}Ip2$51iY&8*u)sh*c`gLv5)N;?|2)`!F1%v$3SkKI3u9+W;eymGBI
zeb=?(Rn4^H^`#<Dt0Kj?@0bpqX*`b&-QaOQ43}V%v4RFgkY;cVOzgsR;7>cO&ux1m
zD3Em^N3!|g*)V-8d6=vBgvuxH3G*cGU8vI&{SKoFGl`CDZYp(Tg|NP|7P<#_g&ci{
zT2PTian3Y7^du3?2{q?CN4{vY6~<`h)lfBn<zu7(5%YIS%*Q-l0+(J7qv#=C9v<sY
zsxCVXe%0CkZ5#(YnBKLQYGVvs<iV(QgNlt9Pa(L~N5xR5_e&o>KFbIUJ(x=dCcoN2
zZX?s7X_5-5kmZ(vQ7?L*QF%PxW5Mx;9GIH+?T~<|ceg&`&<`$q`2v1(45&7mq4C+8
zhdUk)FzFWWf!rmt`To9K0&Ii(-oNks=8He+*NP-y=w&E;DLas8<k+`7fc%p``6FSC
zUMVV0Gee-Uvt)9dO%8&Kede=$2e4%OFE`Nr?e@J6-$_4=K6s|w&?TQn5KG3Thx$FQ
zNJ~4*&2NeXU~#?X$vFVX$khSPZ}|GJ(lpod3^IUreONfhX~Ah*y&3f24)OJ+bvPa%
zbVsTW3ihsdXr6OPtFc_56ih^^)%x^%y`LE+4>KNGXDX&E=HufAQ8}>+-lnZolO>a=
zk9&HFo23t3dv6N){`2=y4@2%`iY}LDv!CX45t0KitoxDSxnfG5eAh%n>r^+;PIZG0
zptX=kUAYU$bn3dV)X7r2kfDPn;+LD%&_1~s#t!3EanD)a!*VYgQ-M*RG3mJ^O;)r>
zpAnfPc|JF=TspCHb#V8F-E)aWt!aLuAuw{FcMsL~0zCsc8+@C549*1OT{HZXMqmgq
zgV#c?ut1{<`aIPNZ5j#*Xxj9W&nOvbd4#5GJefY1Qn`a>ErmhS)%oy?IoUvyrtcYz
z0Yhq2`(!=_4eUS&ZNMP@LpM`2zr7<&`GiaJ0o+`d?84x-ym{Isk2vw|=O-ksAxg4S
zOEo4<X7gj?2pOVNCV*jjZ=ppG$;5He5L}KSGJ`#UxpcE>e%;GU^zkHF&`Zw7Bi`Z(
zD<0uX4+T0V_%=K=+7p-5d;7hNs=_7YK3e6Nst|(<GbD|#H~~mT-B#%q*jGOA=mU)J
zSh}8)2bcCK0nA_y#;y{Dw^U2R{*r|uiLUB;LS+0eGc&?OyX;^k+P4g|)Qqj+m}cZk
z^z#~k96WAMu1CUw0zjPCQH83`tcBJRD#`nHpm#=_<8^*de<Yb#KRllxm{*_=M0#Li
zRv^b;rbo>5uH6PO`3idq$Owc*-<EvL6h)U8(LD<{v+fycc(hIt_yqP`B+>%QiIFI^
zX8t;LzYj18X!>LQ@Dtnzk$&FAGGzlEj0240MH=hN>ub2UZBn5*rv0`XhAopH@n-W6
zhX6A$I9vXI_WlIS)+{XxL|5z~_KrE?%sJLb4HOhI6i^~6BHDmdUwTkmY#ULkZMo83
zwI96B*v>X8;B14~t&iAf+bz-{icBJ4F_uLQSvhB(%rnP{IrbR$ec$i<)>{9LI60JA
zbv4i4nQ`{s|Nj3qukZcl^%Z|~|LO1hOD|4a(?=PtuJbs;<{%XhSG5xw?n5UYg;2T<
zSjqz4S8N%XITT0fM(K)MZIGUzviTt+P^Ma%k;qTeE$dOfeg+k<2%WHwn<bOKQohKZ
z)5b<M!ogB5`MUtG4pj>NkkWfNU}mQUS83D&mdge{6KB}4)C@~lE7g;8<)KD0;><BE
zZQ{Jm0&tu;qj`%V;#}z7z$x<<3q4+~>SCWcUh)Vd@uPi-w=zmm42Kgw=N__e#tgm$
zHs&$SMrs=ZoBoiP^2GS~ILA#DjS7%r%G9@1DG{cYU}Q(Y<)n4XlCNc8M~*S&*Mp%S
zYO+orF@k!&wgnZ2!=n0mvN6a6jQj`B-A)_7?9HkBrVDBRXZF%L9KI&CWUN!4>nwm_
zOnPwh--nuP<DrKk#q<#ke{7Nmr4sakF)?dUlU=`hH!bWS$?PgX58xeYO67Fpy^=5t
z#N1QF$EwWvSzcsk0ln=bn_@{@KHheMWk}OakYT_jW2d?VJGd(|38s!?NI1rG{<`M~
z%mHR*Xj+|<pIsP_AEiP^mIWClp(>-Hk<OLekU^mgVw^acm@V*df*tf(RfKfn#MvQc
zmU>i@jc#ndf-=?DUlw3y=iZ<p8}Ypb8Oy}NJ9=h`s@ar6M`}`8fXU85u}11BV23Z`
zj_vDd$Aczt+W1r*&F*^6EH?(%<O*wzKa<6S=&xEcGZ;RRm9C1UR-0lL1zf5jaU)Sw
zj0|g2f2>~}&b{ms{&@aSR~tRci^yg}(;a;dLRu{(Eg>VWDpp7P6785?A8BHEk(@Pc
zLHMr=N#nTpL|QxKnq+Vw&#b4rtz{-Y9v<KjwRM{Zi43tK_|bq=fihgv0Hvj2I~^i@
z|Hc=5T3UVg!)f>DK9IIf5CS5G2RDat$W!^7q>8jL3rlH#_ih>^gFtK9lV9@a2G}uL
zu*FcN)|()848SG9d0_F4m(l>LH>Q(Wq}mZ3V-^4&LZ<VJ(01BHO^Ttq%-xy^vDLbh
zJ#6MRd!=FICrs-QH}>CD(zO&r35te(1XjYx4~+(GNl@oahU4v170oZ?5tswa%wp(F
zhyP-@(EjQZq(Q;z!D53ur=q^apK8+>Cu~GCZ%Zy{N1hU$U2)OQB`FR>E1mbBu@YxN
zC2sjdt_Oje0i@vSilH|MbM-*|Os&#uaI3Z%Gifg<Wn;sQsvw3S{q(%i(D|!?sNGT9
z8=4(vYWVz$pRzH=3bn7Hsyscm`F5kw)eYuk;76i_@M0};>Bh!NrQ3T3`(}Omy)N6k
zU-Jee^x-&=Ati$mYGM(Y#jYd_@1GGyv#bl0#C3=LfX^cp%jNUwu!kE!w-v3g=9@i{
zL5}hy<MYBfuOq-xjj}<OM;E@+7=QC4&!h)N6P$AShV;bZPnzw?oi?1;(ldIJk|xxK
z9RdRI>-a7LDsk#*o2g@P;8=oMOsRIzc1lRAkP2=h-e%=9A4vn8#S*-TBr|+zrW~@p
zm-CZRyWjw04LB{m&aQ@9O>E=ojt4aGtNT>V1*rSe!h1{ns%4Fsp2=u}EU)v|y+&XT
zFf$`F9SnbH+Jcc3u>xs=V`ImCc%Z^{nbrszAZbN)fKmRZkXmG+4duy<nrWsE!<Vzr
z<s}xX6Kw6DijZ4TLBMC8kNZxsZgF32MKU7Wcx6J)N=~=U0M?_Q>!*V_MZ3&1B7;k)
z*NnDBI+kVy##xtZ4fPi;XOn;26SwO*jfz@ymbYH?L>^E+7g1BA6%Hh&8HLNg_e>Sg
zEcbY+`Hdf%{mt#nz#_RuOiQ*2sjWQjnD0U%NG+DTOu`K9DU@PxeXS4W%@S0XciqM!
zGc}ncWPyh5WqWvJNjO=b)~+VQ-=kjR=t;<z5tx~fx_@+YJ1sqYF)ci>o{nBX;!Nxn
z)9`VA-Q_uCIGL;LG|fYt5wm*bl8#bN9opuYKoU|7+SNf8=w*0kojiU$4X)l!BN+R|
zh|%r>>B+p*X!pS0qxwt?kDYU)=zCE600r$2X9CWeqlVX5wZ=kPnSCe7gdK)C=@!`L
zg#kza^U8sIPFpwcy%y%b%mHR*4Ltn+`s0s3@z4L#vxriAbUNCHD8C9Wg#iT+&zaek
z=Z({~ge(l-GC|{}CvmCL=;yMHnA{6fW@xGIEf`XW{tT|<(#!QATF!l*QDI0HS%sDF
zQE1zY>Z&O68eF^AY~3Z1GW~e1j&JlqbjzJ>!Y(BoFxXJ5y_*FPoY-YL>|^MJnc!|h
zU`41Mbp-l4H0)&@d_pqw*ccYoP}HgXdM-GJGax*(er&{=;Gwy%?{k72mof+KAStgd
zAC!>F%KEWm1{{`6n&wTqNMdVlwbL~aW+bPaUV~$b<|%qML6Br8-9i?EDeR2l?MUfd
zP6KX&P!l<~QjwX;D(&v1{b#PHtuOlYwDitL(#}&)q?O*1#yZ+t-0N*-xjuTx<Z!Te
z0H+AFg#ZIp7u~t3dZQBoCH`fo@z!sl{3maw>BDPjKwBq+8kyftfz2Ec9<WAxA9IRX
z{0}|41grs;l3-;mYpmxyYra`#nKZg1X9ox06Py=<*+dc%om2pZI`fC)2C(7jmM5*}
z<}h;a5tswa8DoQE&41u{yzu@7B=&33a}3-{n*o^c&8ND|j*2A#G<%DOWjYzL^U;H5
zo+dRxZbZ5Y1tWp3+7cVQ(L5LBYqA;}$4m2yo7{h+j6k!z3B6uzu$eXxEV)&^VZLRn
z;z+qNnams74Aez^Xt6V{rl|p0@DQ>|4WvafY!cz-WU-z|b&`Ln#xc}FG2$66gOJZi
zY=Va1<ozLs>U^il^ca`u=PB$s6WD!usN8#l)U3o+OHTq+f`H1PeZ(^pkPuRm?6@t5
zImxzgto7x!G<xi&vK4DaxG<sCq1uWEl}+Gk-EM<Ox&_N<F9C&ERnIn_haIGU4EJmv
zqYQvK_Ze?YH~yc0kF+tS&JxJ^*lWHy#@rjgcpgHw0F~TuA6{sy>uJ)%{^7YfkXm5x
zl&WkIM+0oa&U5tC?R4QwKQkRb0_)Sxf}DV~!_XenNaODbVl76^08YSiD*mWqnzBUQ
zlx!PPjk@9t6W(=fHGqsR_fTM@KK{Bbp6EGJruEUu!qqcK$o!Izz#L%CSPCbD!*3sT
zR^H!2s*4jCLt8k~bpbh<sFY<=z7u-Ya8pT&(aLt3VVqOpmTcp_B6nf|RYiC*ckP79
zvB(k0?yV51L9)(3wG2S<8|lF*HI)6>Xm6-Z&J@3zpb$eD9%GIEs9fF=>I|<dB$87s
zegiU1r_VAP_sw?H2%Er-;8PPD-CEDZO!?71?~-A;x6!>e^v%IC#_6;Qj<wHt`dQWG
zRi^<u^;^uaz1NVhsmO$%C=ikin??9^SZ5s05^OD3&ZR;7I!-V#VA9~o5fTP-NHI(q
z)9oOzSZXq=e#xa1v^0pec3FuYgSQugkZ*qY>2&1~s<Ag;P6On;9D(^_kZp`hfRv3j
z%AgGgQpXBF=J3vLS~>^E07Uhl974^+ykt1tl5OvTd0T-KP5YTUY4OHh8m(;Tc{YUx
z{*ry%Y&T|uu_4upk_1zvO--NrP_NOL0vlNkeI>2f_a?vyG<lKQ3IKW+=@B~kv-5P)
zjW>V8KfF=R?EK@*2+RTIj3x2lfBr{4^u+gl>rW&1&=*Vxchi&sV@!4k;<5;@n1wdQ
zV{S^@<l!9^OPNXY^m1Zn2@GG!t89i6pAZz!_Qb55QyVLD4U$fLx8$u<2v^p>X)`(D
zb~TsNmC{&(r^+w`{Mle>oO5}V4P0NfR(W{>UTb4OP2T?gcrKDd?RenKaW1wwvhl4N
zvx?Y9R%XMR2E}(ds#tm&Ko*>D)Ed)dwD<HY6KMOX?jdWn6GEHARWe2gsKwaWjfD_K
zJ~fP^83+U(Qs?m9qtM%9*f=(Demr+L)0|K&VW&ih+d2|?aD1HBme<R|@J`!Mo9a_8
zbxfKGFL5TPym56q4PQix{Rg&E7rDs}o;gmNba#&99N1Q_d!H}9F*QsNr$8QVTk^|=
z)n1zH;@If{%7v@91v65W)=x`2chm60&!yItrF7J@tkE*8LsH?<1VxJBvD6Wa=?)K4
z*T#CT)<Eos0F`R~!@i|K#yDwy<oQS@g_)F@N!!lt=}r#-Fb7cCAvF@zbWrbo_^j!l
zUn?Up2bg=ThUMwepT?=;|7<!$B=TS-O?%+_u%M?9<*6zow<o)T*r+C%%c({D@+l(Z
zz*7^B*=FQtM1X}XdWl4f=o{H8yt0e^yrB^RYJ6>ev2J?DsIV~XQ*Y05E&d?i9cB5$
zEH-RT6%@K1z#FG>%Xl<_4&SU|-4k?JQPHJ~24*gWSYw1l){5%B=#z+VjR;}q(G9gU
zVaQdh@A9aMAQ}LAH5r8J@zCg~n3#bww9!uMb3q{kw5su84-tzVwyAuR3e-I0zSe_d
zb8n>3=E|n2T7ziY2)2v<IT+%hUYg=;7E3<CH9~0{00V;0!_#vALruogHQZL7BJP<@
z9Bt+^z5P+zd+hmi{w)u{4)bU_c>FN{3=ee_N?4nM!7?#W+<qU&8MKa(H1p^PiBO@I
z>%m5Jg#83PQ)H^9onv*1m>87bfA5>6N^2pok@hx5LD3c@TM48sNh?&L#f$~G_}<F4
zMoqs-H)@?Qwy4N7cAj%gsy4LW>`@$ZkH3lGz?nPklS9;xWSnSccQX26AHezBJw{*-
zF!vak2fpeLe(zJ?`@ekPWVG;}1)N{r!g<_Oz;Xdv>KGVMebbsP*;Euf%!zy5`1C@C
zwP#6Bf)3;bAILGIP^vg=Bd<netV$zHQGKqdNxvIRa-*9jI%MKvG(WyOf$?ACl)|hT
z^2#zf^M1qz4kD8~svm@2Wrxef)`2VtyK|Yw{W_6ie`h<HskIReBLp#q4j!BW5gKRC
zMO1mM5aBX2F)~i7g9DDsGxL?7Yff(-US75kjJVT@thd|7QH`-#Rke8c6fpTY^~u#U
zQJTBW8np3?rOPL$%|JwxW-R+J-GBR6w$eTfis``YhSm~O;n8S+I6_V`&O^l`<<wf$
z8Z(A~9fLO+M557SFgyIZ&rX}~xRP#t-!D4P05vjBdr@Sk3bX^w^pZLW0EO>Ix3^RO
z4Ua;Fhto0lnRv`pUX)U3uwOz<%)-?@rSM-mk7EIN+yl;tSp*<44kjcRC7xu`Ojn<U
z?L~WoQd*3^)|0hgSR-b{*NqxyW6!2a2;))}h#?gYQsYd(RE~KPX&b6B_5pN&AU8gN
znr}GxKl8-TKb$!NbAWk?h0z)Aef^-l_JPI0UblS&jtdKX+FFLe6A}Jh{Am+10Mof-
z5()tfzq?zmxM`;lPVq!~f-u_}x22v5rCk*9&;~KWAi5m)OXZQx)_#si=}ZvVv%BZi
zQSEotNp)jwgRX5#B9769=mXp6G^9|Jx>NjYF)T$uaN-(7n&YP5QJebgN~IVmr~Ot)
z1=4Xr>}(Knf_y=RGS+4pX{s2#(`t@aaxZnJ_p^2hpS3Yr^D4lrif<V9&Cl5Wygxny
zHt+tQ5q6VstceD^A>Ao9NW_qRlS`vL&$Yp?=U3Bs12)5(bRTBMQ34X~fjB^bnI7)~
zjNo?5jPugk8Ixq3i`8oD&8aiqS31k->c_99D?0~i4MCAFu7dz0^-Blo`)R)+$QX9g
zjo8$nF&jNd{yTSeQhEcfAgceQyU)y)sK$sllSuZV6zknRNaLq&rNuX$Psa$Nf$ZZp
zBnMU>*Q{35P?m*hfMROf!Wfyuie4dERq9!FIgX%^>pk7UVcd5O#lmEOpwUCLWgmNu
z$51Rjb?LYNgAaLA=5O~HfjPju#L{@=zx?K3y!rq9nQz1Kw7<Ut!{!1KIga6VL%&b@
z=kxef8viZ3vd72hv_~fs*)`M_b`b8MtJ=W6=IsU}vbH0Q_(*8SE^)ZB*6J3gc>GGB
zsmsY5Fy9`<`xbM;#<I_dcrHcJCAM<)Z5SrJLqXe}<Bs9132_9T=AjOXQ}rdcU&N_W
zV~Jin(JY1Qp1PPq*9*9mJ4`jiG&XhLe3WfK^9Y^o+MKblZiA8inH9uq%`z<QNv7IC
zw(VJXa}`Xpb|`ba&#%;l-q+fp;Dl+6R4zGi12&yw!fPxOe?6?%7IKnJTl-ieAg{tB
zy7bcZmtYQS5WTqIMy$3}%!AawR0BIxVcmo`R}VSnZrw_UFCaPPn=YrtM=zv<4;`k>
z<u%&}sw7Usv2$o_&Ei?8$tDLd;2+?CF`NY2LRphT1T$kFb!hLvJxnIK4k_uy4?mSw
zKJS4vLQ4PRlO807u5zw*=miIlHhRQSg3%GkbZV$PJ5}e5Bx;g){qoh}dyV_64I2!s
zE$S!F+8Hw!A&vTY?~V77foB(P%6HKojO%#toih_Z|9Zv<%mL<Qmc_-d_(Ol-`G58&
zzIZq|{QTbGZ4>F+=deN7uraX^!8MA2Xn|Xdf(Vx^mrb%{w9`3lBL$*GVW>BCkMBp@
z&t90F4?nX%%m}N>W{F(fNrLLZgi(~9iAv7k=Vywd?)zS)hb^eI%kDHu#QHf)ZbdNd
zuKXBjZ;)h-Q87MP_iRA(z(luAJro4GYNdm81IO@%3Y>`95epz)yp?|nDzZw?S`cOB
z6Cf)PXnNflL5&!k5^v;sJ4px@b0kK=37mZ~+dz%unJnTyj%hfID8Ntv;{lc$2UCv`
zXn5M?1Z%a6tjePW9Eir44XNQEW$@HZ7+1PjvqQKK_j<6QC~O_`1qocLURPO`xy2`R
zjj5BCadyk@<5$!2yB?JtX8-3um^R%YPJ_PGb$r#He@tu%f3qGU_>XsXQ}^;^_%9<R
zKXR2ZX$D$inl!X&1r7>}FYKn)9mLM8tfe7bf|-%qPP`=UDW`^G2nWW3Ev-%)R~>|=
zuQsy;Q+dego36NKz^`a4QgXp_4En_QN#a9N(DrnYR$$y;PH<8fjo^K^j`TPCf2lDx
z^Dk$Pz#L#+!NTZIhF^Bn?)}iw;r{0#kD$4~g0q_9PoxJsLkn4v{8SSdVJ#-uLQM5r
zC2cqzo~5t_JYq-ay`Yam&qxrspGctb6Pp#PeCLH5*i-7QKqd$!%klE+A(7gIUGs~2
zl3liaY}-|8oFLIgvEwkR0yBQ=Oj$<VEgN%CdPbsK$yQd);b#et^AHX7#<^LeRK(vy
z?M8y5`@-Ph0HZRDxSKvFez%5N(f&b_L;ygAOfjb!(Rf$w{1HeJ{R*z$I$*OHnSFiy
ztQQ4X_1&{32u$pxUOuT~lMC6DdWQ>P5r>YA-*6E@i%+K|&55r`47Cmf0~idCkTPc7
zflbYe9<$H3PVyQh_<@i!Ips0qVURXv{T+{_>)oG4eTWrfOo})rPvIZ)qw%AP4q3LL
z0oc2FJFPu%Iki`ok(C)4GiY2__GL;D%=iZ&+CEIJ8_1XT)=O!M(=eI(zXw2KJdARZ
z;a<kTNIOb!p*wWgUuYxs#cp;N|A*lP`s(&l=PpdGN$6!DCyx@~9;G^wY54)bZAdRC
zH&f@}#WdL4L(pit^Tyx(-SZ@ud4BKxVGb~_V300+^`F_j`+a}nOLoT#-+y?p`zzWV
z@b%>Mu=u7jG`h`&X)PRHv}nWVkSTkR<yu^KU2)#qMr>*|OAvP>5JugLzTC<Z_v(8|
zM92G8z`;RJn@wVq*F)mxwAD9XJtRT8GNE;H7?5HcHfr;CM+DpC20Z5JRZ1y<6u@M)
z)@q=pOSF|`w8F?~v?0bP=zCGA#V<x<biY(31Q&yUd&U7qU}xhYRgxqecQ}agj2?L(
zL$*=t!rtbkM(GVHCI6O(#`^8k5@S?kCToL_j=RuSQV!~rRHOFtjbqKju{ho!#602{
zfiTm37}2plv-F2P6JU-J(XSX7NdRu|(MR_u6agn*8OV~AT7|vk&f_npDL`QL(aY)i
zbNzI@dkjypH4D&`TC7qN##c0uQT^NdyYS2$q&9*mffm?5Be+uoAP^IygZ^Nv=?<7&
z^F~_yWe=r&VAB9$jSKLVrsFc4m7yL3c+&bwX@=*}=s+9;Zn-h=;5Q!=gA~a;&aBky
z!d8atFZge)Bau-LgH`d<YG)7l`$Aeics?CndnP^e#PjL#oBeZt`#*c&KYRZVy!Zn?
zG4r>3jKCaVUg6T%{K`K8i)8xZ8{hvY|H1a|;cr==zK~X12o_t~#5qhWP)G2@Oa2G0
zM>cIEhK0y-<5de=DhMaQy6gmB4`q^5sZb0Pf@^2#F?of<EJVu)l-HNvq^ZYs(~+C=
zKzWt6T63hja;{IDq60<-w^D->$)@eWx?57`WJ2jF(v2EWRblFiMq`{*Xe0<I(Fdos
zBBn*OvHJC!jcJYPahp9VIIenKo^!soplx-Z0h|3(=G9>`07<2WwJf0dRCQj49#Uz=
zUKMMi^exn;iglrQDm#ilgYTe&rjB$EvJ=E>oG?!o0OlCaY;1MY!8u6Uhb|%VS$bI^
zxg>Lv?HwN2itxEA2xRl3_K|k;U1}#r)x&0TXD=N+eJic}vNxt4f-d)d<}Qrjt3HQr
z3v#25Y)DG&V^T-wXsYCwE9Wjkih+vk0O!_Ff??n#*8_lJ8A)b$9>11)&tFd`k9N`+
z_L|WIhXLa61nr(OqcZ>a9CsbH2mvtkO49;xb*vV8uoAX5LdO8bn*lP~C<tOwTcn{`
zgc@-PImcE`p2sl)AIFKF&!i82;HmV1k6cf;4{&Fv|9||sFIxSshaP_OAO3Iu(FbvM
z==^oo2+RTIRj!LmU;f9w=J?0|%6Hz}-T%v@@y*ZN*uud}%OLiP*jz{<VaKFXbmszh
zI0*HNIA!iL+iXDuP344pE4n9}i|vv8%77p)oOHBOwZw)pB=AS=43+$<>1ws<f_Bz`
z7q%ihR?Ro0XmbroQL+<l>z?km|1AUA0@My>>~++V(?YQ*n^b1y8gP_yEr`_#U6fv`
zb7F5jC|*_;FJa|bTMBwPsMmx&OIp={dm__lOf6Bwc=5La(lIVkZ|Rd7Qh{MexdC_<
zWzVv_3`tEsTuus=vd725%q2FK=A=RO6JxX#ErO~D2Z+^xfqG#BE9=T~np_>gSWQWV
z<3xa=Jq27o4cm+>Vfq*!cL8WjMhsnu#~!G;jCA#Tk3XMQKL4$0<y~(|2Os#i>7sJ<
z`RD}KqmIzA0Bw|XGl80{BX{MM%U2W+!?OlxyOH7!Ilg$9+Y;1M%eVK^*4}PfTRoRf
zmMPhcWZ-8}yqz{sewM}>phD?K_8E-{)vh*<w%r0KeH;P8RD+ol3)R{jRH8K`J-8@$
zaJ)d;dodke`&fGRV;@T&{lK&7v5z08J3~lZc<%hh!P@yt=YP++N0$DJpZ?}A``e>Q
z|2u!hpZwnEW981@Z3N~3^9ihk{ulq=AHrkr-uYMG@`s*z>iPdqul3x8b_d>NJ)9!Q
zQv!7Yre&gy2=w5r%>~yutcbwYUYsS|L988TjPzuZB(l>W_1Z|GBJkoWHa-(tLo;or
zpyTF+R@4jJq~iTVX$)W#jK-sR8|@5IWNxGu72R}AWg%G#3OZ}(m;>BwJm?MNRpy=s
zECl$J9tRWz-FD2*qoS5+jIE7!pG#PITmsjE3fW>PWxF?<_sgchq(2D_BuE-hBU-t2
zfer#xjfl%tB!q6$;!n;*ggR9Y1dg!gIc5V={4y*Ki_!48-wNJ9`K0Y!TOgbkyh)OY
z)cT`#TH8dR@B;Go%N@leN|{6;#js^#1YCAkR}uH&$0yedE6<5BM5-6-qmR=p?>>oB
zD38Z!>m3iLojy-8ML9fmlHCxEucdfCXV+*?+27d*xWVQDcV&n~FyKQ~rv#YhNYKUe
zz5PL2ymli!xCldkZ(kXhTg(TDIuIa}B^ZL2q%zHeuWug#eb*TJT(JZC;}~JA&;~|(
z>Ua*EVmrb?Y=ga{w0r$_y7kPn>G6*|pPqPfH|^|j%ofsx^-<c|T1XEd)4=%$7Sqbs
z-b2f4-ESWD*1q}U-|^eNb8^`IBk%Y<-!V_&Z(FHJqx{<Z?NyAxQ$O&x+PALX{QAe9
zeEQ2i_S}n?KYC;Lt$Ur-%M&hoZDz4ovK@;!Guv$?mPp6%B#nAt+YGK!5J7V;+yg5`
zmh_CsX8yP4`Yk?yb%i8Fm3*jTNVsgXp&a5Iwa*OdnE*iW-N>0pC{P3RV1#VBq*^#0
zY5}2V3_nx1aJS?He24a27{!PT?Ow?ohB!Q5y2Yd*$PRBx7Y+rEo@O}%T!T%uebDZ7
zcW@Ba7%GneR2}d&LRv;Xsb}~sj;G>z35JZMv~&qlo)pm(P*<FwIY#?M*6`dE<?tHR
z`!L<M(g=0Ze(Qg%m(E8x`U=lFO;!I{hGB?A3F`{+5dCSjI<#(R=m0P+xU-V1w{RF$
z3*VVn5>8Z<D|jv=^`)9W)B!0Db+Mf-qym;!5c(M00}}!sQKbuQ%u{PTnXZh-)3xz<
z(w>Z`OOwe-59KbR{8h|IA0A>j(tFw&k8r;CaJ)7gPWr>)Xl*o_Y@*>8I^EubV?@AD
zF|D2BQTnZ&2h;D|-b%e)*cX_r6$j|GVegnCOZd&*JE_05l{U^@fSuk`;!6T>cPs}b
zN!T25P3a^Jkf-nJ(lEX8PyM&4z1B@1{Ud)bJ^Df~EibKNaX>ooHOjTdY|9)9eI__U
zI(YGLJDvNiccuR7TADm}D=pl<n?}btLf{y#!N`$zfT;QHqnoMq8I$zre~oyU^T=!t
z2Z0V%1Q^)`02~wr=nLzj?bC6CuGj7iAx}FlZ9m4~1lgQ1rt+|xLUJ2nj3#Ic#twDE
z%7ZKedwavQyFE&GZ{sr@5SU1viKJK87SqONJ6*WcN}K25O1-t5dMhh96%_8)aKBzy
z>|>*Bq@(r~#L6sv=h0#JfBXP#K$5?-um3ix)$i+9g9*N`N8&Z;{D1uo-*ovG_D_D_
z^le}8`xg4k&_G~il1f5Pth4DQ%airZAz!Lh2QLu}2p0}8Omn6<1P2-8?Zt-crp&F8
zWUEM1;u!cCr``gaIvfse?H%quH5eRR9gmLpCX><8bUZkiPQb2BCkH2!@!rW8$G%QR
zhuD3Gt;y)1yKr*2xR8z(TlUv&EgUYkTl>H8ZQnvRc>em2a0D)W_3vu!9vnY#d^CDE
zEv|k^TD$P&i)))-xOBL?_}li!=}+8RNayY_u#rR=_zcO062$f)2s<Q#OAkIGaa&1Q
zv<UrxH{rRlGZJQ<QDOYdbHnX){`bExt-t53>4m@k&(p#WK9wHWxB>zXHJ3o2E0Pid
z)u9xb@Re#hxc=Te*h!s-E~TwEy&;`k+ey<GZ>7o6A=1YHV6Zd~#)Gtfe2{J}T}^{G
zK~=R17h%Sp=|u%{8=MWepDyi9;YC0<ZFA)o8%5V?&($Vv0@}~`7L&xSDcm%k(#<}T
zFGF!%I7X}hI5*C<(z)|^$I%C?t9|^z#|;O$wc*j$f{QhDc5o<#jSGMW#hx@61B5Uf
zjxRrUyfgmw@A&$^_Vj;*%jHwnq`8XhH5}^i`D=gv`n$jLr#A-=e_Cfsj~y5JZGZAq
zKJGe04AX{~Sg#<lP^0iW+1hOT;N}cfg8_`GXsC1d1cxpxgl@;|KS~kEvM%%{<7J5c
zo72JJ)ye4i#lhfkXLNLMZ8A8%i;Rc6NJ@2pBvE(LG5)60yXnvUuDh?{u*{446pX-)
zfBH8`{;#J0>Q9c+(#8X6<AGm4+}P~2Fuk{~T~Bv+Z=`dGil?Y1nVAw5C@hWc#~|h|
zDw`HD>*QQz=c~^A;7!7&B+<j?-N&9w>tFEp)O$P5hxu2J`KojKqSS|!-i!KBwj&;s
z4C1`EeK&2r@sYH^?8@{egXb7S04SMs07%{865?jI((_L~pKh;Wi4vnIz_|GiW|}kx
zCq$t}N1L0->vkA4n(;nxQfc#@2?KDPft^kpr(rUq0RuPTcz~pvU8bYy<22BfPFh~Y
zG0r$sXK5Mk+&DU*i({Ornj^EQ^fX`|6?;RCgq$Df8jW%=ww;Cjv^eg*4Hl#y`_Nzc
zx-a~U-}{~L>ApW-bAWk`=e=|3(O+?w`WP-^R}WYuy9HqyFUUUw6@f|E3S7^{<x2k1
zfPxtUoST#m+ym^SZ+9&e%OzM5e?tF_k9H?Y>vWd8o%M$&%j>=AczCWg8eYOt1P`AK
z26v`|qZ{e?_)a=Hx|R+PmH;y2^r!ymPWmHX?c&mFJV5i}K6xYX+rPaBozth`?+X!f
zzd_CAF^K1N5XV~(MIQxW%!h+fk%)E4++Gvd9h9lqlekt!pWIf{AyPHv%L2=C1*cr@
zJPkVxq>j~3dnnyp0XXo)M`pXwTI6E1=gfFx-zmQ@L9MiRw3~*9NWX$(lu!DQ_K!%o
z=}PS>{DBHfla+K~`AXXSv|bu*9w{f82tHzC+WjRY(||V|{%D`+0HoS<ccvo%|D=|r
zBLE!(czlEK&On0LAfWMgf{_2tdv%{}PH#8-&A9(qruL(O6xDuam7@(2eV|4dHl4*O
z0!4>=kL<U)e;+shnor)O|HobO9AI9<`FZ!>`q?iZ9S=X92f<Ro&ljbbaG%t7s<-Te
zNiAH%bRct`wH}h{-;pRolI#Eu8}?fmo5O8Y0+N8njJ#nqSN3tp@7M^B!qM{zTsp4|
zTixwZuXQl)b`QpjOWP;C<y)z@{49Xxg>-cAZ2F)7*;RyjUx&V84^(52{@ib!1I}x>
zR$kMhzUI&NL2)m`$>eRw4Ed`u@$XI3qlXs;G_AGMZA72nRwO*!i#?hj5smUBBVneh
zY>IoJ_c0GMTcP=tF+%uYM3w7o34nC-*&8@Uc`x-YZKj1wOKI>tBHp1c(lU0BGV@Ne
z<)Z|#Sjb&~M;}D_@D5V{Kl(swEh81o9<!8FM51aL`ON?lJvbt4byw2mPlKbvgSS$D
z1t0=IN;vOk0Zug(XBZO$BM2BNf(byBAj5u8w4ut2n@RRa+jn|J7u6na9R9IxZ>k#a
z&M(?{bfKmMX!cF}kJd!rs1unf(-cQhGZOOv0hxQ(_tTx-i(m4izwg()>sS4yfBj3Z
z>74&3QrH||Uc-s%bQga23)|aimr?!>43R~Q9$uyv<65P4*|ss4vH9Qox^HZNP>I>5
zH15$kL2-qVwB4i`ahT35YT3(az@!*{^kW1TvH4)COqamaLZs(C49Y&)8I0SvkiiPd
z@zw$Ihrq%*L`2+ZCyhtj=@0!A1Wdq9Xo93nlgS}K2T`Ut@^doYO=BqIC*y5=r)$t4
z{oOw};~)JR4)Xn6)K`7uG6*H;^5Qw@bGE?BKae`THz8v2Enwe29rXGQX|#WNVYIup
z0E4#qdUVqM;cmK(gpyP}(eQA>!(A9Ctn*a(N3#g5*-I-%G))g?YfZ$JYt=pQ_*v{d
z)Jp)E_T9UPhq;ckVa}!HH(yM9&pv~cDyAAULQYP|3Ak3G?E}7+5Kpsv=T1s*yqsE$
zhk=@kHjNf?kF^n4IYk_bPKP~$X9f}uEaBk4MMxJsERDcv0j9thHWx0;Y9eit?!Bhy
z0PHi1%#U`vn@P0I_n)=M7&Gg$YY2>ule2*K8^@2i-uTW8>xkW<_Z_#x3C@d2up>>5
z4$=V*Dm>nwrX>vZesAdypupdLKQ7%lz`TaDzWx03@4f@BaWXXSMq88{fNNG_gfHU2
z2&&Bl3S9i82ITtq-GD-Sz@M_Ca;5Nch~alj%oi>LMG7iT1V$!ogGG7nxv6p{<kU>^
z*?@Gj5wR8qOx$a>!OU+8x6?@nUQr+~I4yQ@GHo**@5%u-+#w_<SYjs!3%~1c9kiyC
zz1DQR2R;Ac!gP9s^VQ*v#=jFBRRT{myrV_<Zq<kvJXzp1DjDG8q~ACeHgG%|-SWYk
zTZ5d@PLNT{6ot5#ETmp~qeq-+33;SRJfmyo7H0JtC1Ljk(b+_S&thS5(Rl!kAOE3u
zn4(EIwHg(vD;1G~QfL{ls0%cKhZyruW#CSjEl%ixOY%;aILHwvJ-jcXk;u`lvf+}X
z^B_#Azp#KKC&34_TS$J_ItR=0dHg*9d){SGIK%=X)@1@Cxz-GxrUMCv3~p^$bIjq@
zl1uROIF5MQ20jhhN_K@s6I{lKuJ6(r1SaZOUE<x%bv;zCF1y%NSP@*~Ye-qM^O0xM
z`WL=Et$z9=>GqF(JY8}|%1ZMM;KW;;KW7A^wb+9v7`(|wNLSNcLG4g`O^?_I9ztd|
zkM?9Sb;rx;xsTn10Ufp?Q0#?-!^&unfwUrBvYclKCgJ{Xf`wev`L4Fo@zAF*Pi09V
zYhG_YhFU^)w+?Z`e^a=Bi(li4;|f)O=*&Qy3!Hc`qbtjBq=00#gztR@C^Jd*^e`QC
zTfgxK|Hm)=9l!2de<+B6%Dx|7bAWjbXL<XXXO8IV*uf?SL6rxK)KxsiPo86>ismA0
z>M<^8E@1o3{xf}+mcI6fh@mfjb9)J((PudTP?Yzb?=;i)kkjUfaS>=)dHUm<ehUDy
zvf^d88MQDvqp*GwK0PNZlRF?-VL`2f09hqLsq579PQq-{qKYxwk=x};8gHt*IFxod
z0V`AIoiTD2U9*DqGR01SrZMO0ljb1bmbvtB_!DGgu($DYMp>nLfAxd4LG$RCsCjn6
zrIB>X%;%U8l5<DDW|iVZZ%SunclHj>e<HXF#=_xJi5ShUmm~Yhgm|5@)e&{E?v4*e
z>H5hEm~EtT2|}!UO-0MxUUWOQ4jHkwy4g_tqa2X|E7ozT((tzmdkgI_Cr5~Sf8&*O
zb9E7#)d7t7%ui#sjL6u{WR*P4LZc)*2<#`o(Sy6YY4h@0YA*pT;OKUuh<>Jn!T3<T
z8n2}%pSqcT`o$Y*e~l5MAna&(2jIs20_Sk7F-BZkK(9sd^F8K;*SW_i>hPk^y?3>q
zm)MK3a&(T4VxL$Rvn22rsN*m`7+KrPo|$3^2j0BpQYU@E8`sl=>yX6YRM7)Fu{_>h
zo}{Js;=!Nr1$p1R%>m{$oa)7c!#}^cee=IxUTD1)9Q=8%#R;RUd2YEpT8!Z`^6A7!
z6oTvCvNDUF3phyIxRZUOJK2;C$BB_Oy`oyupYuCOx`9~SV_J$Pc$m30%4bzhyfTRn
zD6^76y2<1`h>_P7st+TD4vy-l?PL)e^|iv1kd&<B-v4NMpdhJ|t6ZFrOYHFO2XPLh
zumH=L*!au~EXSG1N}?062M{>H*DRGn96UL!Oe^om$L6}7p+YsAv&cG{=RvXfJ}wSS
zl&kzf7o+rDkduEiHRy!;|Ef<a$W@^JfPhVsuDcdbc<>lhIY=#^wvm|f2xqsfQuw5&
z7C@}cwFF6_3u1l<P~bWhC!x6rRTe6pu@P#OIrz{?VEV$HG<flL>OZ)d5`g0938c60
zAhQMCiIu^a<@s9X_`A0Y*ToXO$!^_Eh+IqyYdBh9=LpBcAWnz83?Czyj3Ek4FQ%)9
zd+F*T?L|yWX>HdMKtGgX#?@f)uHlNk;>#;mh06Bvv{~>zp$tX$n?V!TW8LuqV-BZ?
zyRm|q8;xV`S^&6%|H7@^^idos{Z*fJF}?i~GY%ZU;E!aPi;M5W19O1M6EOeqsz>0)
zcYo90#Y2C$|0{pbh4FBB5h|i(q_0?6TkCGT`H_o%8!;zWrekuW)HOK~mY0ef6f3ZZ
zoMGIABxsi8H;Ayoh&GqXDZmlw=tVHKr!!SD8>ZjN%SKr3T1Y}>tT7NY!fnJ?dsSkL
zT@S;pV9beWZbhh7vIJGP*Sb+P*|<ROPc9>K{?!kq(-pyTh;`I5hz(#eaLIi%N^Ili
zcLhipQ7vN?&)L2HFNen4n#WaRZ0b!z&sa7_P-Av%J_2A&O69lAKjbPx>h!tuGRj<A
zH6gXwyMd3SG56?DPwB=^HO>m8>ce*0g`MI8gCOyruwJQE;GPR9V#-h`aOm=QY%D47
zljB1<hQWYm0d^Bc+z+0;mill0^t6h+WJe!=Hf^$Zk|->WhJ9tIrA9<+*(ACiyvcU<
z?xgYYAtV?W?U8eg&IxS<KU&6t363UCX=`CQo$p;pHy{fjn}6mvVnhwY3L{hiJLV(i
z(e5krs)cRq+i%U;H);k$H~1jrB3-5C>x#D@PPftrcdw_#58g@VKX(HuWeDUL^Iq@Q
zl$P8-pXL(GCt<F4e(c+Cp;-Ru^^Pz9x`!^F?De=lC-j?#0vRSr=bA|bYhrFoouTK(
zyHX8hv@gKKXql5eCu|~C+#*uXgu2Zekm;sN6TX=;xzz`&OE2BevMLZVa_HP`^vrqk
z5J~*9m_L;hP6+m_|8G@9n|#xjE#KV4nO=Ll0uZ8|xv5mg1hw{ynNI2<{^Pc(sKT3-
zbyvE}TjI}TaPd(M4nF`U)S7W*Z=ue4(Cx_#(z)X{|22~3&f3frg`X=>y?+Kw<Q9CI
zv^H$;={cqUbPe!AV+9ZT!snea4AF8~jEcChK=^r5D{33T6N+hEtRVZXp6LRYxT=^K
z5>Fg+43OG>@@m@nRqsqIZ+$4;_`#2%3J=)KF)%h8!Sv{zDl^8rVBPH_l7I7o2jNWy
z4>TM?Hl;Th4;RCKi_ojB<A~zRNObx1@G$Md?V3wfXJ!&q6cS)_EUX9sQ5UQe0V<_O
zzM!h#^1cc#tXi`%6lE*L{^O{GnNUj%cB1p?N#rs6(2Ilg-gO)?04WZa-}Pg^=RHfm
z^4mXfUj-}A0j9a)J}KWGzR>%ojeciq40Xkn0g|8wiYRtgF^GfSMLj6BNby|}1AaE?
z$2c`#>PENyUWW`}1Bsdui8$|FTpfwOZ^C%axL}s=b;d>|&{ZQ+7<^MH)Z`~O!GDQh
z&*0n}5(C>5#NFtEx8L8r{rZXhtLdSSQ7sWzKP+9n<03m#3X6*&QArB%LIpN6_u5eB
zHkryT$bIRz;h2wkH5#>aQ(bQKvDE2c(vWaCPsKwpkF8ID#*%IZ0+9-cG>=mDz}8_4
z16diMUK^(a9P_&kudEXs`E9PXXgV`R0#q0$BIXCK$_wz+GKZ4WFU=__x!X~+UQowM
zr=Rwoy8-vy<Fx$f#WdYQ=3^S}0c;Hst(Fud<+6>_w2U+>djOd9(1S>7iGWMQwb0#|
zDfem3CkU;>@c%~GksgEN!XW^LDQ_6ms3ZYeuLVf5qlz!BQ{a+2SN0HlTO3-f?^)7`
zjaf4TS|h<Y@Mr+;O$-gykFE8z1DE9w-Plg=d}AAu*9a$K?k!>Ke-0kIuhe97fH`9$
zy_T2W|2x0w(>Bj9{_Y9%+vFmV?n$XB1R}007wM|FP%Um+Rz-UZBt(yAM0UGo|5VBt
z5g3_Mv}>y8>iwimR+EuvLm-g1u{6O7^h)(HgM?9b12u=9*ziu6?Kx{uMZ2v96^#xs
z$sE6z>sx-d#Rjw*BbUDN<W>FAQ+`iwPbkwe)YMPfxOv;;mg{3|q_+XP_T0?eG%zbl
z_PT8-q5_&yY(z?*iCa;p&!I06hod}>rkyj!LGMcU5_d(pkOXkw&JmJ2B1R+~TlVBU
zhxp7?Mx!B&)~h((Qe!#m@MM#(p$^nnpcM};TTHF%yXhFVm_E*jX+5x+4jwy7TLi0^
z_z22kUuRTJJ=k$}_HmT*F+d3t%n8nkIqASV4j|J8u(V<GSORnQz_gz(K#IAAK1={I
zbh)EOz)r^qn~_T~5g-}sR7?izuB4ePsl@!bB$C12fXkZf&)bIc>8{^xT`L_av*UEm
z!)A1tZo+BdT(6xLPxjKdzxf+*%N$_xPMd%D4<3O_7uNqE()iO=UtVQna-P0-!*e-Y
zB7N;vTsYUVp^It0@^*F3*{@d%zMw#%dqL2H7H5e?n=dv|fRs51WE6-j0!{#fo7bLp
zgekPu0IP8DX4n=f51?}(Xo6*_FWNe5sIMm?aL%p;NcwvaYrh`9n)=X@+R1asfmt$>
zr&bx`24M-ZJp&|poV>n*!FklHriXpV<xkNnQZEM+9|mi)Nlh^kWsO+5p;`+&Ssr(~
zY2i8f8#0AGZlK)^LP9zQu^t~XQ?VPbW%@I9+9JSU$Q1SQB4KsuEHe_4Y>KmmT0$3H
z#>4E5(*T}c{kJ@t*52}9+W+W>($>kktr5p}fmY@yyleQK!Jf=N&NE`VS8$ZFf-cc6
zxK_7ea6yhxfLJ?i;)npM$*!W1BLr3QMMniQPkOsT!uR*mooXe_@*;kd*)Q`GzP3)!
zW4(5pwy>h6oxn+95Ai(LanReFksPy&Y|$s({#T;V|8KP*UiTm863i!gikFs`{|lac
zNpCVPRVCJwQI;-`W}GcvttJ5&S@9!uouxO4q^Nd83Z1D^7lJddI8VB})-Q3&DyIU0
zm+QZXQT?jPyAfpl?(Hn(-aNmY9L_TMDzI#M_Is9No|Ug0yi`|dPqT-1z1T-)Iv3sr
zB%Tpe=iI+FDEFq(UP&BAO*L<I%bLu%A%wLl<}Y9H#+VdffX6~ZQt^oit~$?Wk)Et?
z)_;>zgxa=-Sdxr&>_%~QAi!7#$4T0Os_YmH4g+gUpeV$D5w@3I23uB@Xy$ADX4|-c
zb+8wuCEP(V0{^ezpyS<-!4C5)KRd0x<I!~WU;QGU<-JsM8R*#<mst<thK=bjE~SI*
zowWAg6-HFTD9+qu%=!!!8A6LX0HM`sH(efg(;8CK43Xl0g1WhGOsX^|imZ~Y7=fB8
zQupNSau#q@-B)?GALEtR)y%bEp7cBRIo01|#3Ua=@_J!skcK#jt=EEGX(4^)zx;+T
z?0wl^`za=WyDwgIfcYd&^Pm4?-}0G@%Zr=x1fyA>>kldgMJw8>bk5%O?fULqln9Iw
zAtULu-MF1WgvJ)W5u6a?Sw#etv95~zvTpx6tN*W9%rY8Ye(b~0F$cU<+YXfmkl5pv
zuF3%e<tXS)eeex@#+c8l|M)%9UgZt$<<yU7hk_`3hOpQeHDM^L5B(X79WjpG4pZR$
z4rux<81Di=g8S%~|32G?S;AEMQN{-Rdo`Cas@{&OFLhb0C%h1YA@Rj!Wl3*lW$;{i
zdX}{@m`B5D8X)Y`GD5Ho<}}Y26FyE4!DRzLC8g9rP!G5#ytEu$TvYT#gV4dK{O9i=
z`{hyUKXM^0Tv&!gf_Rt~;H>UPAaVmpAypwe@#Fvry%CSoLGsH5<Q$`xkZQ6n`alo2
zWk}PPCdh1zl>WQxwyqQ}CPxAjOX5tOkj#1QEE`M{U>vnWI*WDTfTgd((bHg~pU@B{
zp5>TYh%MB1NL@4T_R|d<4?V;f^?5`B5?^4Qy%{%t<cuxwy1X<8m`~!gF0b_8#^_`N
z66a{e1>>`5K!A)(Ee<fYo&#(OX;$iS(F12$D<-TEH+lit4<ciJPi&4tkeU+0>@eJm
zSjK(&U2U<%gx-~^LMNiaNLP(Sdo$3*PL9X2&rjia9bijWYBh$_K{myV5KR<Q-F{u%
zx`T$sw)D7$pG@KdV5LF|b(OH(pSAwYa>p4&Pl1Cq&PW-hSGFq#!rD>;|5*}Ut&Y(O
zshAX76L!rU1gs|nOUYca=QHS%)f#99NOD*@a8)J{jgW?)HkFrrWstCOCy-GBTEH>N
zhu3bV<xhWDAhUl1M;=o*C)K2_msr0!CX3er5FKVchJV#yXD@XwAYlQLUrqpeJc@`|
zMivH^o^&2k%=t+Nsc&e9VR<~HjB{jXABBNGKG(+o(x(7f8-I%};~dBLin+)jbDBMh
zIn+@Z8hfUaLA%k`5u6bQIF?}N_ykUIIQR`i+QBt=>Aee8&H?77SJ?d5S26-iON;MB
zDiz$yj9)>7r6bd(3WCA1?A!_~mE5VxG<H3R54~z?j@4`%HCGjciq9BNcQ#NoDZSGg
zJ!6C`bmiK9g_{kcRQ+BP*d|S7?V;>0eXt$vHO&zDc;2_U_tiir)8cz@^}0(yQ^Unf
z|HXCNHIe%nWAp5s+Smms`LDUn?igEhxo0ejv*t(9=H8F9B)716=)_EKug1UuWcJzk
zZfgh)R0atH5(a2?d1N&L8b^!cw9{qeJ7#9Y!LsCyY!72$VC|4<XA!X|7QJ6g{j5Ka
ztS5X_+oWs5TH{HTN05+^mu&en9!_g-e<<Dg*=N%QdQCdanbcrwQ<)&#rJjvt-ken=
z<J?E`%azOLp(exlE}?9uUBc0HZJc1a2+8X_>_m?qw9<9hV+Jtx!;@L=$~B-wuCx+-
z$+?!nDGO#=x9U(~6quQ(o{y#%S#xsheaXYi?8H}U2~a3J^Y|SWq;z+*m~I?)(v>wB
z`jO(Mw|wa<P|`m>W2w9@FU=*GPvW%p`it+FEKu!0f<|4sbFbofn&6O$1KPv%A=$HN
zHiA8)fo=H$4$iZxj-rpQyBD;%aP{_UE!Bt%wu=Im;*M;*cf4Lo!U;l{H+=;pRv~Fp
zZF4bjXts6sp;z~N^sDjS#GX@qnUzM$vrUQTrP`?~yy+q3omFn#u2W;G;h>7JGN{jh
zOP;sty2jhbHoqIsfVl)Q20`9OI+GLT@PfwHY%Dg%l45XuL_<Fu$*ddku(z-V<P79?
z%rTc~AIARer(Q@~<Ihe@Z+Za7924tCFxhx)4V$6$*_;S-e+EFo(csQ5a;L%;)Ist~
zs>v3Re~dZClv)R#Zi_&abFj;-Kr+DbpSDcNtYuxUXHxb(P&*j}NY_k=T%pBO**<78
zO!obh4X0ubZ10qj(m`gF>KL1Y@fZQN?;gVb%G5UqdYyK9pW9r?ukUYjfcYfOZ@=HW
zZ0FH)3DVf<(m}PLS-tePwHhWk>>wU2fgBRwxe1(R<j)Q&rXmSm(^MBmhzjR!-(9t%
zJDPPjhJ)=rZF{#esaEitGZf@@J9tX?A*UL5WXjvXlcILIH-IU5=o!+>S%kXo!^;8T
z%T0d<+o02RpM%f@{pGt)PSksE(6WbWDK^^T3e)_%tr3$>ni5e_qlV<7=zZMFB#g{U
zDjNmc#KXC8eUHKkfTo+%I&d*qyBFauJYvcityk-eemYaqGfBj66HxURk-QQqW8ia~
z7B6h1#VZ?dT^^)m80?$(aw{|$7D`iXWCd6Rz72Me(EI!rl3zlmLJ;T)Y)1=7-nqbo
z!QfT41rh8VT&{0IY8o<IGJuisFgOw$&4}YMcwP(7fN^`0)?xuRepVLqP@8I@H^D@6
zr~Y&Q`Izvt&qK@!vmclwNT8$`!{BJ1bV+G(G2Plj0&+M(bZ`%P@#dfW{qOC+_gj7v
zKH;zb*BoFziF1qtkk;u%ql1YG71s7O1HLU*rzC~w2BCHW6okm`^9K8(7x5A3RMJck
zIRQ+74mXJdjcuD;oVJe2&Cr!c#j?`Y6_){1DGqq$SuZDpo^8;0H3@zY8JCnodOB-P
zDz5QfQ&yU3oOuj-Rw)%S*BHur{F}UWU6YMXU6zemNH;by420GEdV6c4c9uHK<~EN`
z9_t3MRlriK$r>s%&r4?Z08_&<_G-%XFW~f%p-6{u9pOB=F<gAjEaKqe>x5DKI|EPP
z*cjxvugsTz7L#GN3rRo#sQz7~slSSITi*SK)P49|I(YUf0x1#P$-#0U3+Yv39rhWI
zlUeEX(lK(Atz0^n7M2hX!=Ou^bcvMs$nOUbY^RM8j(CPc!i!Lo?IC;d2<J{Pz>)dO
zs1~y$2naT`wFp^?eTL2U?~zN{=BVt6d#|rx@*HIHm>c*{6i7WmMg+Q7+fmWe{z|%g
zybpU!H}#;+=$;&{f(7^t)bTU-s^|4|eGV|6#M#6mN5}!iZ0$y?$@f{fh-PBM`*MOr
zYKChYO-1&cgEBG1z{ARSkO);lengbS=z)l?x5YQM$RTkk+b2qStU~@_(ew;t0g7I4
zbj)fo0_Q#Unthau7o1&)M!IqT9@Jd~cqWkY6-g#FvYj!{_agZ9t<8H1#~b&YYR8#%
z27t#B$cQon-ah9xn;OR`-!gOCc<ZR_%skkA1Qc@F<k~r2zC5%(WF+S`aMwf7`Vl?J
zoTy3)mS-4bY;yY}q<Ep-k6r;$NUhQe;O4!Nw9Df%`TXMUBpp6}6#(-{T7Sy}Y5OOi
zOczaGXY;K!#QGhql)47}Bol`6mH<)*`^bxiTx4ye@IUDvrjuhh0pK79dPPpK4shr|
zx^Q@s)(;_(ts7VZQZ%>$F1Ft7kTE#{_`KA%nv$&F*5a&$GrMH0<(v~|&giJac_CB5
ze&OAMsJUn0gn;PQ0kTc+4qE9D2T*L{c<27~IE{NtUxd5n0CUEkoL_nsBY^1Wffm0)
z8C>0f#TPz4COMc2Nk&6A{2T49J+o{BgyN!vy0}Om3JnpUNG%?lIJ{#_ZZeHYEmEKf
zej>B#6*13<qVnQpgx<I}yEh1BaPj=z`jp!$+8W%UZ_Icu+hL3V&8n5|WxKH9DMY%S
z65Hri8~sP4l}!YTZ>8R<@9JI<;;M#RZ9r!=FL_L6+7=9eO{@=cDYJl7t#<VJV;dY;
zoI5biJWG4_yGb)eji&F08_tN>JRBl+<gmApHpO??F~Lf2&(tf3H6eu@^Y~;Q{&n^c
zm-XQci@K*}f^5wosJ9<~A)P}$vE_#^r5iZgegwyXE>Fu8-R0fB1GE-%uQVVnNGNTn
z#EzjRTiHAZ!wyr&AXy|cEF;E*Df}69c>(b-=Ww9cO>7%jd8CRBb(qv&Ixg9dMi#cY
zZH9H8DZw}cRxhTR&vw{`W!T8lqhf-fcEp(3^c!$mMg=quZQa3vX-E>C9)L}$&$K;@
zE58~yed|k9^}6|KF2Q^fr?TB?A9Rq9$=o1msAOg2J%dtfMQTAtM&3fQiv$xYtg|`p
zri#WSi0D+&lbfVdI4x}wxtIp=w`Vf#&jj3p>f%+(4nQzMSgu#N;}2DWnU$3RAc*wo
zXRI+K3|`0#IGiQH7#((K^9IE={$z*1RaCQT^jWmj6q!v>I%^J2x20LyDI@Vc)?6-!
z<Kz9;wFt<KKH;6Nuu_dmaxq{Dn?QIZGNy&`BeEY*>VfJe;lR<uWdsAlyRjp7+f}Q?
z7&{p+4Uv<K>o$ZA8PV$rt|x~C+qrhGu`X0xCokMhC&-b~Ilq=V7uVDA^H!b#yi4>N
zR#ARQ8g_OJze7U70ciVFlbyeiT1)U0=nm6J$Gq``WZI8#&>2!>UVt5j-eePk7XY!X
z{AE1+i|5IpT&kyeOokXJ$}9Zc>ZxrowREFa-9z$~+^F~h2N;YWhmhdHG3Rd+xF+*&
z3~6xzCtgm5{WLg&^8yV0(@jWtC}Vdzedj%v!0Ybv9AG|)Qwj;@CiULtE=fs+mC^9Z
zedg}B+PKnDg(l0lZy{`1@9!M70gg)p(f-WQ1{W1niaW*tK-*g%;@KKF>~jGaE7($S
z#C@UuDlt7)RB^GkGMWU|a3VPU(_mGT@LHN^fR>r73q0sYyV8WnYC?0T{hEO7r6>HQ
zRB!5O<o7wVm;$tC5ve?nv(Hw*q4A4;x?@L-X>iT_PJoH?U`~UJ*??#dU<Qe?z=dGM
z++M9TSVd-uE~EjS56WQ_TW{yvLxLEOVCY9uOosz=8juvunkz%NEhy`aIC)4x0Ef=r
zAniYQ9Z~u3NNbN=Nrz87o>uy+e%>4<b(=c%6Yq3L$dikYJ-Afwz%V|BnydqWnRX9#
zEC3H8LwabqD?<`MzcvxkzmBFJ(p!tU$smDc`wN>Sz~MVrpPk?->4hF%yzYIO1(<uS
z5L1s8fZ@1IIIdES8Ne`zRnj%i80bJX(1OeN6v;sEDB!Y}mZ@26ojm-~tMK*n>l|P{
ziF4UrSom4E$p7XA<%hC`ndKL`P8m6ow^y*N-X7bxMLKaf(#|wOW!zu=%HR>3KDd4X
zOvSrvXB&SmrHubpU?fVVkYBrzSvhd&al#bDkb#3nd``7T7tF2`^ggTLqOY}7p{6w@
zA{z-;|3n4e<_wxu-MK9_0kl@+=GpMhpp7KD#sug~<F1!OsQh^KQ~<jAh3hqnSLJ23
zi;sadF@WEn^=p!XbAaEh^x^=+*`tj>_t|NvV-VNf71#%u=olQfMeTENY;33|BVjib
zh$JlaD8#yTxUZ&MtOyvY-Z5fSo_HaxecoHs@*6Lvn+qRNoQtSB4B_>N()~E>{MsXO
z?}fJ*+I9pr+1l17yvdO3?0}K|^sNR|z)r*H{tbAPt;6PX?E(zza#VA#Nl8h74{Scr
zf7xzi>!(yffRQ?uwi~y*@O@@|<b_f%2Oaw6V=L*!jr$%4Baoq?A8nr^7Dn+%_-~B(
zo;wF802sJe1J!Xr^~wkSr!RcNd;a22Ju_R+>+Sm-U_Oa63jXDX!P0@_GRuf`;?8et
zNe96ZkeoqCf(UZ;%VWhFTfoA0cjme?LfG{}P@9B5cUeTm%U~5wkY`i0<onfxCQ&wc
ztO$4LRjg&PJxdX10%<X@zO)2U>J9+T?RS7H;8biQq3Ehpmhe96=o{lF+S%-%lhJ&i
zDU?o+=e<;Rvv-H<D+{D#drBfgyTa-fWeOx*y;-jl9<Lk(FSzJ<k2S)q%sBK;URJb?
zF*<&Cxt)f67@Q9!+}b=j$aHCBM=Xpn1Ek7f(<&P!W1?e?VUw*-+Vgr)lWjkD6UQGP
zr`{u5=>%cy(?dAu!tFSW`wU3Tsx?;QZ7^YG0cx^dIz&|e+C{U&Ak~0?=R|JHID`y9
zxDJ3hzcWc|ki<r7$OHfhOs>j0j+tl4aO3eaB^F99Hs)cUp?k5}W6Ey1mwL?B33qQ9
zb5o7MIRa^5OoE{*(vBDy$UPdM!qYdq%do+;k*a3_8K&W|d~yf?n!FW7&H<(@tNEu_
zI|9%Je+<t3&_DC+b&AhC=fxJYxV-=sUJBKVv-)<fIIm|^IaR#`5$)<C>BLQ$K%CL&
z?E9~$t$c0{i?|M=^U8XgrJ$-m#WBF!9%tJ$?uqhArC4k2OfVx^yi8LB79zl<%!VpZ
z17)LCzw$Wb2MfsOlAN%YajoJ%0-XK%nm9$EET4NhwOc8>FdFr9b=3GQBT{qJp(?9m
zX2?-1m5X!S9h^-9H!^sW9i{2E%;UBeMc`pWfvfW506FF@BP3UYxie=k!_$)ypddWL
zGYpuV+}TI@IHmI9TIxN3co?MbUq&nlGd#!Aw&w__dQb(c9cU0wy4>9ZA%-;2LT<7X
zxGYbP5p+p49*inI+_nlg>~lDS<_gl^OfGei`^$iZ!IK@_M*};w2zE{2LrKN}CmaFH
zow+bLrvav36S3FK{*`SnKDM&KKy}C1mlM3xv^0byGwk)!!2u-4!8ARvind`t^g83W
zqq{%!%68d*x_8Ybm`~!|eeqX(+5V6G>mR)ZBj$ycI6-V8>auMo+3~bSNy634HX4aq
zv`d^PrJvxMW5Ea0Y4(s)L`)2&-J8`@xePB{^%yvb#McsxKKsVcs<xU~I>$;9;SK?(
zwMWtB0v`2nmo^(As|oFC;vYkG8hip^Vkqs-GKN{DC`X671?pr~cYsh0R#sMk(@DeS
zuG#L~em?OMs<hd;s=uH0nMAObW~`sC_9SEA5>C9!o66N^REg2hPtLvt6%h|A8?G#-
zL&T44=1I4;LPAXSlQI+|CmBz=RNpmdv6iYn`+Bhj#sy^ev|uAyJUT%P%+0ihSeV|M
zE~Uedel#tgtok~(HSH;#%>=0jecteM8+MrP0ghq@$Skd`nDj%?Kz}H`(Dt!}KAj&d
zq>DSq`88Qc2R)1vgE7&bHb91llF9l)LqGc#U{isL#)`enAXEBUR*ErR`j>k~sxRV~
zrIrb>(wK9+(FO({(e}f(ETFxR<<ji~*u%D9L+Sv0mV2L5uf*5u=Q+T9l4ll6?cagh
z`3kBt<ce#tkFJXsrD&+f&T5V=VrRnrhST(`X}4$+T?e<B!i}*H&d~*lLMNgoBQmGz
zb)W+`7J+jX{&JSbQSAq91r-&SzWLwj&D|u%-qyx=I6!(6>v55ptGaO7v)RDql}Z@a
z?$c6^L%sE@`ge9N?}a&Mfr<gYgDKaD7j9lmWv&G@G~&C9ve^Ob9uhqGG~qu03rqsE
ze}Kf{bjxWvKoZt5;$aq<lg#8+F$xAw5ONR?(}pDCi?v<?4G}k-H!Q0S$^qf$36#r_
zl8!$9Vp{uxx4{l`Dc$URNK!hb3kv0yoYd=%<>g;Z%L)JiP#xXbNlTmS0zO9aPax57
zz0sYy19j9I&Wu^zMh51)M`;WfX*oaxh_n>p6kxg1Ncfq3xemg#&n2m-+ihkTxtLC|
zVjNPRmyi|%=qb&}Yme#rk&JT!fIdNTQ8F4W1b~ikNWz^%9I8e;4DK7W7w?OdF>`>K
zr2?<<_qFBaADS$je8nQXYvlU_8)qmb8b;?CDdgq2+T6?_K2Eip{Jn|(!P%bbni$XG
zJ`jEf#)w8q^0l#jCQ+wJ$s;|vZ8YB05E1;jLZ4LXSxn-L`bs=?MUmrUEsfR0zDPV#
zff0kO@^C4wmiA^rxE`NUcXLET8am~ATsKfi)*Oc@&$@0BiQPf2ScB^s8EotWU}o1r
zZodrGHs+EzeEk@}@FIE-Vh@t6;~{wbBRTIeOw28~Di7CsIu(*R$<zu-vs5RUyYCPw
zWBMzMz}JrkD$Sv&`e3(NpX{b3<R9C8{&t!o0Hnv{l}O(|+Qx~M^iCsJuX1^Bwm3;T
z$?-k7p@$UyyZZ;|8!{?Gf<Z1C3j{^bBEgDM*)n>#u{TT`kYWzbFM)7FQiGb0DzQc8
z$KiK(ZH~|3X=d*REHMVNDl*49%?Uv(RG$Il_4^4kC%`T=M4eqoFJs!s5Yaz?T5`}?
z7Qh_B3vR6|yXBi6`-AV=eA}P<rMvaId%b<01I#CRMtiOH{{>C*x3rY2$d0KE+N0>>
zGYIl*lWL-PR&yI1cum)WN%V(;pazj9Cn(j3fX4sup;l<+6AYbzl=>ZX9Hb<biOQS-
zFx*ZKOh#C|vUn%4tU7kKl+f&Xjdk~$!iMvK0^eTlWdk+?Dfyuybs5Kh=3J<2Gn6>S
zE<monBS0b#pp{w!OpM2@Ehg55IBQplneam30f+%q=mN|fWMUpDLBLJ9-G^kexrB&v
ztVd)()&XTw-&%|anIX9<`}_zlHQ;QRxzuYKbA&#p-95CFmy0P~n7$u5xNwwm_pKMx
z;u|ie{hLUH*$2Qh@Hdzicc0+;b3jvE%K*d~=fo@_s((rq*bx{W0ApuD>4a*q4pPSq
z(>d6a4yQPJ88*);RC|a)^DNS)S_9<hJ!~#C`qL@kEE`M>G9?vK0F2KM8G^9A<afx5
zP=DcnWTl4vg~J37vjM#Fm@;M$z<2}+1~UQ|=E(-U73i*f_vKdQf6Bj~tsMVPxkvwz
zwf&u+|2a=$D|~PSPG*c%r$y;*jXHQtzVz;ea%#c8#BY4|tUs3Le`f40e<l@#7YyFr
zGfi3-*Qz*mbIpjSC7}vN1kAJ=LD+PSexUjT!QmgvME~kP1p-E}UnZ!m)42BFzGu}A
zBDBt#TlQ&EICB3+;NzOT*Y_6t#LIzmbGbMdFYhAzGh%9g!RZ>5TH2e?-Iw>~wYvNa
zNu;U%$D)Euz9(!2IM2LC+2uTvs^6Rkf~x;<EsQ;-47Fv4P>~Ihx<sliw?n8rd|rg)
zz;j_TGZYOM2&vQ6pS7a2T?iq`p$|L6KC&y*lXCfuh=;-1E?Gw`dBDb7HoQvuv1AY|
z1KwmXm>=B%4F+g*kTlc42LK}jJ^%thvW!k_+(G7mJvc2mqoeT+y9%W<cT!MWv%O_z
z_RqRrO-`LH;1rp032@4FV=}5@$fg8IIUID;0qjJ(IHi+Q12+g1F?#SnGII5rH)k%v
zyyioCy1=Dw=lc(jhwo;l)l6i>@sWaxiWU(fZ&xcv1d`lmr7qGlE_L`#aVbAzAB-EU
zpDi9_QHBb)W_aUjH|0|YUwKoTy)@}SF4)nZ>IR+UtFlG1^hI6wKHW=I)#!65lyZM;
z@S68n`MF$GYW-8Sm2OT~yg9k4WpezI4|FAQ*cm4Ed^CBqOq6FNq1yg4tM+P+Wh@0F
zx^{1Xd)VUqy005G+t*TbAx`HM$fz88lX>tZJ|6*GCLr2V1gI!R#pi{@n*g)DOVt!l
z!gL=q&S9)FbtVSSM^%JB+JWETR@@)pT>58kAU*Optvqrm-C2Ud4qjDlNH<|HKXVSe
zXukE~O?HUfWK*cdT2PU-5xBU3)H742j`85m9&9xm2)bOneVAGod9oMtmVw8idopb!
z1T#kUqm7oFKeGrtPt?{U{&L`D%sy>W-X&Wu7G$7dj|>W8p2Fru<_<r?*2Tn^BizWd
zoirzC>j2Wr9!}#VLqxCAZnt%LcJ5zq-{%1HDVWif#g!kQOk3a7B?rf#%TO7GE?TsN
z3*9J%2z^ld*ld{?TZ3bhn2O6m#C2aXyk?b#m2En9v$$j;<b<g~XZ2hFKs7kR4kx$)
zcA2u-rsFvnu>%EUjr+=1yXkC?uJVYrxB2pxXX0)31$<b2HjageR{)U5I0$!8M5nK8
zRRs(LS2JS!bY#M3<Cw=o6XI<%<h0*>xMF_n4mOU@nzx7FV!s6S^7g9~cT*3QnX1ZC
zoqd*#h1Z<e@fjUj7vJ%@3*x3RA;HDx4pOX)m-|rGz(DSUtc=8po52<4)>3e$0gN#S
z?xB)b5Pr+)hS)7rvBhmm-KDhs;!c{}rMvQSI(c9j2a>^E88I$Xrq%cH4q$4yJNtU0
zq|!Q`q~YNKY(lGPLR$;~3;ALooxw{Br7VJYuL5vZQP2A1T-pb0Oyv{+Z#E{p1lW<l
z6Oia#7U^UZH{&kL0vJlHVung=OmqDeG2a|1%qAn-mTMt;%%%)G%mlg8P7nh_Kx>VW
zI%asBMn|j2tb8X8QArnL)$R2d3-h{u%>m|9Fuf4ve-z7cit{d*)k)lxNE8=yfJ^4k
z@!O2MQ@N)!))`h^m<5O+?g2s~UX>)18Ff)N1GI)A-^gGlkjA?ff+<3kDPu0pfzci|
zdgKt{s+|lTvyuWj$Y|@cUiN6|t@;`T<p=E9QoeEg^~=7;#qf(|Lum)AZg!Lr69E7K
zq#Q{^K~%z4Ae^;Z0@0?lG}~HlL)4PNjB{Mi7P)Jd=527Whz>+MYw0elBsK0frlA(&
z_sAJKrKpHMWmV2<NvLT6K$KGAOcV0u9i+jZz+HL5{9|k$Xtvf3nSDq$#ECI-l2O{w
z)Qd@paXNdh9EA*I$fc9W?<3{QjqTLF{6JcJ!{v1J_>(ZqXO_V5Hl~<kN07>U!P?dZ
z$Q<3>P5o7PlL25RhnO>XkFk+$^p`5AC8Wb&+XW$qJMh9X>`RO0b++h$!k84<U1ZpI
zuyGrVf-l*31xNZgL>qjX*+xk?K3_@`<0@^AO^a%8fJ{sAF92_zQG>*mCy=}*@W$)*
z*e2Lvh5#6hakqmB>2$B0x#(X<7w0yZPr+n9_~~!mU+J~KYXtQG_!|qxl3l|C^nC*x
zA>la^{DB-2xlP(<=mOcar&vy;GyVi!Cb4MGXpgc29RSe$a}`qNoSlFL!8NWq@-I1D
zk<gIji)LCI8|_fq8-K>@3vwz|pV2V?Go^zu=~lia1}%R!CM9}0x6Np+|2dG!XM!rl
z>W}GC*bg;2x5PfnbzJ_tWqjE(9md7qqa1UMjb|40Ik@>Cy3u|c8)fo!)l2wtSpUi8
z^N;tipCJAW{^n;AL9l$5pwWT@6%^=y7L9I1H^zwWw=_JMk=@$M@5;3$?mWyUjK|=E
z)^7%!k5^z2*V!<5P99twBm57JpOV9XhmS=c)Tz_%=E4TQM?z}=h7;>P0ATNltD4>=
zWLF+a9|uCLsqlGel;IL1Cs&<VNJ}8lBOGFOLa7RBvKGK`0qxef$Uy-%4M<&^dk84K
z4*7-GNc6edHD)#^F@73DMflqvKQjO{z@;!Fwyr{Yq@+m8RQ$Q}4bmcGjo7QQz*|yH
z+Spt97`tvi9l?DYW*EFc2N2l*DIh=pf30FJ!F-B_e`9I+f4{nO^mWjBXCg1IQ2eo|
zb)zN4=gGzO{IhY7TKLXLhMguDOp{=IV~8G^5!?j{QDv)<8$2k2{jSZGZ+2CWY7~X`
zn*C|nBfJsOo(vXF&++@TisY|LnCbA$Of<)%)E~DwHS}@IcZcNQoW7nbKl;O=B8#$0
zA4(%i{pw{5&TO;t*jRhAaF++$N(@z(MxEoPD(!Ptw+YisT8wIn=NPA@a?qe_GNlv(
zw46CyYQtxiz(7RW1H!>VjwHO7#|(@3@oX4K%LEwaBqK=B;_uVUi2gB;Pj(WU=eY(M
zHIk#_e5yc*?d0O4DAY<Ta96(l)U|YSFiic+Tj};XB#^-n9$d(htTp56w;2gG&ZuE|
zi!hM4pduUWK~1){3U4yxzNa^tqVrh|C6$x3fpb@`Kfj$O4=<;~HRLbrA>kxX?-W=w
zm5gOm=84gK4oN5VJ%TXVjjT$}^H;4{j=%R$<DDfKVx=86MrKfB4Q6r*)!1Z$Owjl|
zMnA_GyM3H%Gk^yi3`u1_&aC{`)7Lq`e2QlC{9E4qBTs$gmwst7?YwJ&i>soOvWo-(
z%b?-r#Ti{s&~M|0n}pGRGtwl&LxLsZ*d(OJBl=4~WMEZb&bTiUj&i3{9w&^Zs-eih
z`@DjRme*Q7@x5NSb+Zsqt3{o0V5^!YwTzW5W9g|)e|qu?_j#}1%Jo`pc<*HpKvs`o
z08|+<A9QupG-74s)Z45pAYroOv(4ja2JCRnV8fobU_c7A1d*6C7G}m}jtfI6#jM~@
zauvCE4!Z0kypz1PDW0V6;|By;4wn(N4+2cEv2-xJ*`yhq#+r@*1PunFo&)1Vyg{LI
zG<9Q0&#b)%FR+E1dr*_XUHQs->OQoUhL7#zV6lF5#@_u1XP&xhf;GX~>>T_`RoB6t
z?bP2~LkfSUvq8Hl#SlFBA5fzYH|3>UhpBh>I32FraRUY(%(4uxGyl_Prkr6?O;?%K
zdC4M2o#CaezmR1#rUG*ZbdI^yW#%C|27oUz$>tEYEvgzZ$&`Es7}xPW#v1@LlG`}i
zKb`E}d)>W0ewhQzr*Iy(mY4tS-NWg-m^Q{ZH={a=HF092&FMrUkfBHwyv+%5wY@Xp
zWf|_}BtmC}8yyIcsR6+%%}l;-(fy7Tcr(8@E5tbMDA0+^+4R>&HPhIzH+Wq^n=1Nj
zi8i&P?NJg**+=Y|04x)(qS8Xdn;mRGsI}Yst+#57tTqRD`{J(}AY$-hRGsLE$jr!}
z`pMxT=2X7G0VMbwnx<d|DliIi7FoC1aM^57hU`?Cu$j@G*@!y;i*i)51zLux^#rUs
zHA|t6<=ORd_}$6+l8oll9Y_>3^|N@YjnLXf<j>k4AYU1{Z@Jm}O31^L#oflB`Dk(~
zT{Q(_zQkmdPv685%A08!@h}G;#gWQ3k<K*OXU#;+t$w$;=NjlkO?Gs2B)dn8$uDtu
z7osYWZ2(04*z6uHr14pY{%LC$N-V^k9K#ix_8Yl0>nH*<@>7zLf!}|l>WrWhdr$jG
zZR1;bVaA#>;M0`YKkjC45+#lOlpd#OSDO+b$`HW7;6H)~Tnndj(sdllJlK)OA2~gG
zudgd}fcX^8=fYtBM+QgxUq77=q?0vcD2@7dW{eiKI7#>T@<pEanu6GjAd+jVqn<^C
zxfgMR$eC~8nqCIUwDc8jxDMWyV3L_qyOZUa>v3#22c8P83fvUHDUwE@SEx{&j#R+H
zqGXftYb4F;qPauZD+WMKISndSixGt2{Pmu3$l9NoZW%rNcc);{W8IU{%m%^AbOPuw
zYXfA&nB$YVb^<^J!wS@6!}8!-Io2&xl||mNP?hlx`dAsT@OPJ9Tt&v_5ip#}>77i=
z6iR&q2;=<Y1yTof5HcyVEeya^TVROB>F}dvcsoUt)J~fWW>r?45>!4iPjLM5D0s=y
zuhOL~WokMFGVt;OsvW`(v;3YnBJt%`x{FBmbClq0aNST5WfuIGB;iE7gZ8uzkJAup
zvZamH)P@u@MgmC!D32Hz;y(sct|BwR^34MznQY^bGVC$x!#LlvyVy9G9m{NF7_wqF
z63%HQ`Mk#3vv~-<B*2AxOeuP(D&uF5bE9f<f;2HB!~roRjL#9HA#ef4;hu~yNQ9G)
zkM4e;%+l-g(;Q$vh4Z?<ef_Z^4TazePpCSeH<^eT32+#NxyZQyjVKoKR_$W#O0$1t
zVrJ-)H%M^$k~{p-z#MrsQG(JoDr&HBc5)d?o65332MIT|KvB6hIrS`Yp=~;3&54|(
zpiIRw%=zu^vg@jcESzuza>GjpqScv58>QA)jfM(v^@Tr{F-dg7>=o80hmPD<2Q>$Y
z`gc?0Mca#t%&VSJ-*J3NSiBym%<(<cVs6g%=6l&Lp+sP70HK}<mt_(&SCNIfM#GR@
zXHANcT8$J=1ke;^;rT>jI36FT%gjke;>X)WT&*3NKfE)i79{|sanUaoa>e&7c{U+P
zOA=-G7kg>%`W;A0`>BT$F5Bl<)A;rX(f&}Au}@VHq)lW!%cgRqDy#>uFrE>!w6!h|
zF{XqWD;|dKz>rqZ*Jao_7Iy$dH|`?V2Zx+dTA|befNCS}8dYg+sI9u7_Lc<G*QzuM
zF2%U(a;!090{E@AmxbiQ;pKXu7a95s>19NZGJwkv0K`B$zy2cwv~Pgem*c~MY)>6H
zl`LZKPvPx0n)aT~b-YeL%mL<8II(x`TzmWB0)oU^hz}UjWf7bmI4zzy$Ssv4(*+9T
zZk?M7k0f0;;$;NSw!Hq$Du#Te?2o`Erwuh!nTt&s1Q9VS(7lRWy`^eHvZ@EoTy#WA
zo!ZJCz{{$SznKIR;1tqTv|C)hvxoVb1EV~J^sKd`z>|Nj!J_(H2FBa12OW5v!`G07
zn#Q|r<rh7Bs7i9(GRntG1$f^Iu$gdF5#o#;4mi#@)QY@NyNH+(^ra3n3CG-v%K=vr
z=MqN$Y~+_M0QZ8xjXl`U4AC??Q+31BE$uVC6Sv?(`v3^{MaI2A$~ftogMb|AaE&UY
z!2o%Hm-;h|Mb&?IN%l$dAF2h1zS0J=fD<Z*H+NG1&6iW}!E@>O#_hC-I2p#?P~x&#
z_VrvzkikH4S@N8j;~fNmqJImJU>4v_);cy1ur}g)+OTcnKmfQakJIrpx6&;f?lpuu
zt%U=>?9Za=+i)6)<CR@;VB5$3$@{5xOE-rc1Cp@6$OcAd3XP}PV3;Fj%%qo8lgV9t
zh&Y%dq`28ni{rg?VTsZVq{de7Bk%qTzw~UqlU|RX=K%95n$>^uZGUR<>Yb;5{~`3q
zFWl{?AN%*W)5o6MLvEr)_{B4Y2G}3S>p{M#@(AKr43MVRj_*OWv--lh&LU!j@wIf)
zHk6(4CqOG(Dg#a2<2om&)yBqUp&}#zs)s}sRbal?%+2?DH6YHnv_BhtDEr-ipD8cu
z6E!a!Z;>br@TAihLOB6*;vP^n8`}~p5_=|qf^D0rU(*x%dEOgsAc3xsyk_`wx4m#Z
z=D;Kb3wWytv?Q946wR2=m$E85h?%N}jkWgJ{gDdDT?TNCvZ}y_YA+IE<Quzm;f?9-
zy$djsME@8NLpxDNXR?d|FtN_GR-)<7glKRZ(^)zk>P$-NizB3_xq1fxb0sZ5dMWMw
z!i#Ag+<Z7h*t&{>{N1Hd4n5a=7lgSpF*R9#1JN|FAGHw^!xa53sHB+1xr<E3Ymgdl
zfAER)qaVAQ?ykbtu>eVF0d^ZnCJKF2AGL&3t&L@57N?YF(_svWNsk;yCXV%Mybho<
zJB-bprCuVSGiS90yRHEQ-}>NIdfSCodMF*HOHfH3q>Xg6fAY;Qu|ZxpKg|K=Q#6&g
zj&6PXi&w{Q``ELG>BoQWR(kw-a91aNq_cqACKjaJm$g_)xv02xFEXukYh(}=0~F(K
z;}smf5k0#)YYcY=&52Y<2GzbUrTHiBz~C$^O6m3sm%B1y3=$zpCmLZbpBd#Qz(fp!
zHQa45w&krK(a}(|Wo9?jUAzZ`gBEOuOj9j|$}CDTAT(kY(o8kLnR_jDstsz!O97^^
z*D=Wmz1QI2JJbDIx-s%qsn|%PtT5x61BBQFYlDF%(;)kt1OTa0SNmxW*e>1^P8JcL
z#M>rmX|R_*@@9B9JpzD%x`;uGv=Ug53M0)y43BV{SAb`pHbKp7rwWtddQQz7+l;ao
zJBw-O>Fa6zJ#R~k7q`*`htreur*nZxxHS?ti_YHD00^E1!`x*3b8CLmCDnH@^0V1E
zF_P}cbb!8q#FBegUrf&|!Oqg6Tmvae#w6Ibuud#ai8dZGZN)wpc9~`id_KL~@toa5
z>Dxfo(lXI{1__2XF-U;NhcNIT-b%-}PEsFXmHTU(>BeFId%o%~|G+;wJ%F#jD|3MP
z6wK-G|HJ>oxBk>K+rRaxCq~OpzKF9phFi!EyMlBPEpR%rswxuLj$|cQCxy5zT+xhB
z4~6bOh?(!p;Piakm(vtOHDv^&W;P2DH8-{~lA>--s$EIpn=1nczcV<nVOuE#FDeUC
zJ4--Htw!{V1QgzCHP{5A7;u@yWaDD36B#Z*R}(_vLl8v2&xv#c17jVOTU4%|jq-#Y
zWCW^9{G98n2+<4yF83Hw%66a`hXR}LDBq3snhSeIL&f%os?KkY9?E-6HL#tOczD07
zb!a4TU=>G>Y|GCh9sU4zh*jJR@{ONKgqgLJb_zuLk9Wnr%U<a9yYAE%-^>Q8B}=U(
zyKM<Xeh)6h<2#3`0~<{1(i$Wh#Kuga>N-1qQw4`gSV>h;b`PFdLu6MzL6!mJoJl7*
zkDP&*EyR}aK(RIerVEDcO$*EEEsLA!8u~cwt_Z};{g|>F!Nw$+2og0YH)-7LpSEs_
z>MqtoY=qKpE5pW^8i=;rka$>v01Nx%X^ivn)qOY~Ed9cT+k>x}8MW8#*Ezs^lBf1-
z|M2hps~@~^_Zy!1=rR)aJ(5}gmNlFo9%>9qE6k}x4&1aH*xWpMP(-niVf$3LdlGi0
zV$YkvSX~i@2o}pvXCfQh$T}TmR&aHS<0jyV#%2oH<ot^>LX+M^-Imh55Sq}j1aS{y
znxzzPRe*+(?7}VAM-j)=*{=#Cbb1yXdsT6z!pnUZ!3+r0;FqBYZ-W`>pm?8Ivl@VF
z=-gaUWeU~4`lTRCzr7*xu}+uf;<`;Q3p3COp2F6Vtrb)0_}US;aPACvX>-x<krjeZ
z3=Y$65b6UIxh3yu2WiEhz&ndXnD!jlYPv}6fBIE*?#y3(J)*2OIQ#A%a*AEsO`R)i
za92K`j-P!xEp>6qWdo$k%B;sS)LC>pZXZJ`*hT7?^BZXbDQkL&EXr|eC8VM*_Rj+V
z%6BeZNFP4IX_tMRM?(o$=0bXJqNffvbWv6>gxq1*^mUN?6w<yiLK<x}TArUi{^4LE
z=1>_pOhV$A@?4l<XDwZ89i%4@Ti^CKfB1X+6rM8%?R9o(4ltjj3H=>^;J;rTZcV=Z
z#?I(#pZI7Wy5ToL`>-tA$5h-P7cJdRERvW@(P*i$uSy*0+^vyGBO>7r%PWnn$JIUB
zq9R=8A!hr)jFh^ZslQk0WVvkgie-;PS20d*BT*4fzfVa5{<zU-C*1WoR}zwlWhTnb
z?q;{IG9*E0<@##-x{QbDVomH`n(zfzU69Wk*AO#8LFbs`<+}r(J)mpO+3HjL<B~@n
zGu#?>8NU`S42dIdE&Z@NvXsU?y0JgA2I@DZT?Y@12<!IxWiK#qGP|Zi@WyPv_}m?)
z-J{#-E&@LvY(t8c%Wb4_;W(T7JTn=mKI|9)aOx<*(%sMNxjpsS7YWK)LhQ%jnVV_(
z-4Cb!!{^hT&f{nsz=6pqB{le{<hiWok#$llOeuzHGUg_`unA*7Y%Kc-YD5Ya#92bk
z2RC4Bj1|P4yuG)cE+7NIVOoI{f&0)$f{XqDNNSm<h%wYQDMl{*`s(-i_p)zry;fHW
z+&LbxOPPwt#)he(RO$(i6X>qq{lOppdw=&OhwF9r;~Zc<2^0GMKk{`~Mwbr$k2|-A
zU;MG3SxTM3Td=s136|N4pgV6NG2g-o1KX%#vb}Y}EX$ByY?+#REGJ16L*<LzS$+}{
zx}7!|W_wzjMU>DClOYX=IJ=ZncRRz0M0=-_QB1wZx#5^*L;<2k=*>qgBrYqJ5}7r_
zIq{&%CE&tqvDOhxb0%i~5;V#CVsu{-IXMo<6?eLztz4*k)wN8<ZS(urP_fMno+_S&
zrdcO|0BXC@8!7_h^huZ0Qop6#6GI!7l-31d^`=(<YrmlFnfMwJRQ@m$IgXu%I$)Y}
z<TDE5snabv9puynIB}j1K7$Bu!^z}D-2V=^@j_Iwwdj2zfu)P`F>;diHsBiUL^Tg^
z0F9lOeCj0vZC)uN?ma}d@4k2|K}Vdr=hoB7)>0bYg_;buNSn`?Aa8%gH0T%SjusJg
zxsN28lcNESS)rN?URf~gFTh>d1QEpE;Qdj!1HZ9-oNhv@Ibgmvwa~y<i9waS-*;SI
zL~@1jIW(rvjykaVt*5lxa4Ze=3|x4J*>n{Aig}@tpS|Ki)jTx0HR%3<GY9H*d~psi
zukqym?!Wl8pFcV`{O+A=M{j!UCl1r-+7=9u5R?1II`D}iGtac64qZ|@r3|c7kxy<A
z5efiM>?h}}%)5!e!5@r(+kSH-(3_4q5&(qGo;jC-vMOND%P1-@+e<7)o@=(FvY}b`
zWva_|+lI@ar2ST@e)M}y*i~RA?!{@K47?>;EQHwl6@B%0zdhuI9F;;2-5Ra&3C`1N
z)h{swA^AwnmOD^JI$C6{cLTf1bl{z#d)WvdZPPd!5T3H%RQH%#C0=Dttk#*A6R(ii
zG`3<)EVwVq67bh^Cl_b?T`z)*0yXSD5<&9U2RI$_v4j2eD^@pB4<|3`)KwlU;lQ+r
zSQ90^<n3&?i_NxveBQHDN^~*?ic8h9M-yi{bR+<{9N&WU0To#n-c`ffH&Y+($5x&r
zTMR*^)-+oT3PF_wl{T7me0MLcoI3~qWdO|au@Z$Us0XR?yI|HXp#N{ewGWMU(h;&3
zD-(jWe`Gf7&loYr!*E3z<7D~yVjr<tb`OD4j$-ckVta#gY2%>K0!fMhB>qM_7bG+$
zRBg3;|L}=l`mXPoo!-~`_c_44hBN$)fBkE}da^wI2e)qDS$g6_hiUg}KlP#G9!y_=
z7brNbg(WOT81}JAm1s|7h*Lnnc<OYio*?|dxJo-r5N_XF+P2s<7C-K41<?;9n6|B*
zpu{E$LaMD}uOPyiW30$p*l*$z<)mVFpb|UksI4Sse9x~L6_-<|Tz9l2H!Zi_vC~1S
zG`+fkfTFM5+|eIVzq)hPtU6qXuttedUVxVqka)_8cr~<Vu994u_uWWj5d9#S1~>Ki
zn5wdFSF^ccQ?Xl|h>MU}do5VhRbD=!HZ2aYndljys1zl^<Ojb;uJNOUQ%cg~yW44c
z?qW)aRiQAVk)lKm`wIgUCr5BqCOFLoMAoA+3o>Ofn5v1kMiBAN5P^u-;I90J3u*0<
z%W3x)pI5)~+}Psur+uDcYO8h5GiCO+@1~Utu&*FkbOG+lZ6@wyCTQx);L^M_Jx*_e
zEA!gUt#rFPz!-3^*<4HJ5U7~3-$vP7mrMO-q_-EJv$?eSjF08_m}8|C<?=)f%;Lf^
zq?z^ay!O%m?RT8Hh+gLx=K%8>P4GAVo!|ELC%wtve(~n*wEfI++IaziWyi>-I=Dt3
zF)VyCLhyh&p<g~eQy(m-zyg=&j_o3oRJh=6bD27asZD&D<K5+y@_WG)QV?#Pd~+s}
z`T0V&ZKK2wq(x!lVFR52vL*Pc4OnxOP7<QD3SOL;W**))soyBZSbeNt1`xXDbeSJx
z567MH-QFwWlb@;Xh-<Y3)~wWsd<Bd)6h04GooBV@VRG;(a6Q`(8%K^+)s;LYf*=|9
ztueYck9FPCkowAd{cE<VL>U}ER?8J2py7B%oQ6S`Y81e5e&Hg_^Ggs{@*IMH_ZB`a
zt&`xY?Lrx?C3<BkSaSmD2A`Sd-^PtzXk50{pbBJS5~KIHAoZNN2(Ka@W$$U6a``!L
zO&vt_Pgmg7i{L~CAKH+3Cxmo-hAYoyRH_B>AGS{F!H%+v7@ZLg^6K&sGN{R>IGC&h
zudx#VCj}X@V2@zqJv=^4&mHfj0nWBr5CccGERs&*URKv-c4O>AdV5KY>;o<3vgZpd
zdy&7=BaNNMKIKsA9+FZ^)1$O7t<E0AmY*E$wcmgJ+IK%wuaf)e^BiDagV}!n-}p`6
zIBgHV_2$jpba=aiv&h%dBNzK=2jcO;_E8!l0Ux7#nQ7J3=2(dE**D@(op_MvB2yTb
z%f%dqL)&{sij9ET^M$nQZeas|=XIsjRS&Z_k`sGZxT!RK=5zUO=hK3FW4+o)HdsN(
z^tmF+xjp&0LfFD!T3XvA(HUI4OHoAL?wOOcHq_-brZ7Oz3oO`^<$^q_W}zxCLGug<
zsgL-RnkQ~OC82qnoUJvWF4t;xWc(IV<g9aoeJ_nT%LY^nizc-=V-W4rBC!!NoG5C(
z>@0!bM_frr7xVPdNxC~ZOb=0?ihxBDPSXlmR%Je;lrd0EnVo_6MCoS?SUo$k>+#R5
z7hM3%7!m!`J2<yxqc4}`G2E5gw4+cxOv$MMQl*|!6YfKBWf#%=M>~6|d*PgdE+>rY
zAM;Q##`H{6AB-8~vh?=;YWmsTC(`Z;B^xHiv{Xz?LTOGA%Iqia?BH8$&Im9Z2pbCl
z3Z*arZ3lLs<>i&M)L+7p&L=;OJ^t9jwDt6Obn-Ezulb1^H~#t4jdgjSewhQzf8fM_
z{onY_UwhJ9_||LCB~K4l(&oaYw7A+$mwVW}IB|Ofu8IqoDWMo4O#6jY;b46xjD5j%
z1yDL8E2FIFgM*6`5QYJWI7GCtb9!dD^-dTi^r&MJ)YHb-00E<8_Fzuou6n&8Zv4yu
z<-nB*o->-UAu};!dmAK@UmWDTnHlWLL-`(Ek|Tk&B?C!=j?EIa*%}Ro?N!D={BItt
zP(1}GSB>^Da6!btJgNx8?rdMw;II%C_-phv%QSYMnMiyNG~HQorn#xdI-kAZMbjyw
z%%w940V?)`wb}lWq+#XRpOP~wnPd;PmvOq)f{l=b69n29g(MHEwQwRUB!yAx;-pBu
z0su-keKe}K7%*Rxp<N>9-#MBfCggTnKKF21eE32dJ^wu7VNfiEd9gd1e3M=z!344A
zu&6*u8a+rP`*(NJ`bDV8kX`wNe=z)apepNP&-IWRXC3#yV`($JJ)KL7=h2TYvz9R!
z5_^);4$5H$0RAb;UpQiI5T@{VPt|0<(Vk>%PruhsgPrO7KK$Vy_}<e4Fu%ePm;=mz
zz&iL{fAzP#=cGURJJ)XCNS%ZAbh-ObT3gzH1!cv3<e`Gl!le-t$t_hE00ti@q6<rW
zvzkhyvWisPu)Iw9Fz+EzDsqhR;KH2%hUk!Op2CtBg49F+kvjVeVy8Aa0d|F45PLO4
zL?$?T<M8ZKO>s<G)20;Rt!PsaKQD)Gx!ig_Du6jF{-sGo15(%@kB7;XS_5T%m!pF!
z*1y6&zc0pt_hf$uzg70(0Ntve4H{cZ4PHkk@=h}webs!Hd66>6K|xo=T=;y;MVW2&
z%)LI|0!?R1%pKW)$>z&!HB!mt<>4w+5pWyqb%4i<uvd&ao3N4atRe?W{7&M!2+3u5
ze1t@j8#Y!h!p_3H4XBT&e)Q21q2_M@G@jc6m*UCQn<;(zm9&KH$~*06&<CcdL7*q4
zsBB*%a}XnJ>)e3HDp^7*oBh2boOKU186+)cT$X-{YA%8&z;Y4YeWbUN-o5rv8lOK&
zJ3R(ME?RO=SBaTziJ6v7Vu?T78D)`}2}+}?F_5Fh;OKWg`nLCe-?NYXWNa84)cH#z
zFb9}Va3Q?^FaMU-LcjZ+H}72Uq{B`+*SVO^b<d?`h|X=evoLEb_d^Sx7qD@R^D;uN
zMXJ?WbiAP&iM0I!91#~M&aV2YH+4868$l_A-tKX(yV|0KoA!Ij#flimy;POC^MP+B
zN?JjZW2h_!px77+cklEk15R^rjXo5TY@q6NLp?oBvdaAxq48qt-|DhMOJ<{BV^~y1
zrcM&6sOr~ug1+C&Gdpz6@5`#Q3iN7qlL-54yDIQ#08x}FY1G;g03O3**J8X(;t-HX
zV)bL7^_5aEf46s2`MB*x^;ojY3_NV3@@lzV1D{$rx8&wNvi?E>SYp(r0KtKXIlurg
z2lOP<7};|Tjb)`0wIAzw50J68QCjXHs(<Ij?UayVxV_m=3s99!wrM}1n#>2?CP?d1
zYpnt5#yl-R1JK7a2Z)K;eBcTq`jM>|`)>-{=M?I*1-L0Qd+|E%d#JUXKHeIn{T|hS
z_zh~Zj*hHmvP|2Dp~hwKWs~KJM%lhm8W@fr{$Dw_^1k6{`Zv>K>HT%D=bvo^<^b~v
zu7u&@<c|&y2cL1U+e+u#7t(5LMVo(s&Dw6kAPQiaGK(yH;1?E-JmfKm5I2nPLQ%td
zJ%`Q3?ILxIf#{?WCFg4^0_sH5e=!a`z{xrpwcF!P+(>>!Ak0Io5ZziA9?$!`Z*~K~
z+#FYHm3oT(qztlg=H8D$7xf~jQj-)Hx>5s74N}%Kz67`$%R=m}s}3-^zN$UW$%iCk
zs!0;_Qm*dK7)3C4;#?8EQeu$=8vs;p$RKOP)Z3oX$v`1I_Y8O(fE++<G97d(KnXxH
zpyB*gM*f&HJ*o+?atW%C8m__HX&3Ivo9w<A2>j_>Vc5^9iaI{bhJNEcP=+%hGl)|4
z)CW$>i(_hK(uQ$@gq0)MVt5Ey=gLMJ;<U;RCH*|jvykj%NqVr!pp?>s<gx>RIk|#F
zU&x$1WzZ(BwNMvxtC7GhkB8|BvX)<fgTftv$IxS%ZQJ2rX;C?z)^y=<b~m>%wC>t!
zCzvzjKxc`<`JuP6@b&Nf6~A)%Bmd^de(Ox<@B6QFfcXTLz}Nrf-@MscZhzDDYdo3n
zT-t13M09aae)w@TE7P5aM;MdeS&RvnC|9%D1wzSWG&k%Hq%F4jZ6AAXDF-Aw33Haf
z7KN5=L+%EvFROZL`mEpevsz;9u-KNUS!HNfo#9O2#46}xZjFD4_<$JNOpB~a$5VCM
zKl7q-he7z_hlZ-k3v8<5?3vI{sQpqWg^i_1a=AU3NZBmZFT*0cR_ZnaWAjmu{mira
z&4FG->8#(Q3`Z;<1H0E8K5|NUuP+0FgI-S4ul!sjh%UK!ou+2f;Ur#Ok$_CXsp{Y}
z<4JmI?>OCFKA#?ibRj88F(<YrW$5QIz;L-|vPmBZBlI=M)S_b^)L?S#P&Camh<pdh
zD34wQz&v<9EnePC#~;6)mb7eb?Gp6(YDLhfCd3NW()FRHYODqD8yq9UGO`UUECIY=
z>~BGWVWz(}K?p-}5l3Sz-`EEOI8KN3I^$Torx_&}JHA>w#w3}{^Hg)_^Bb=GQ-Tmw
zdmPjL%Hr32#+QD{4}9o{e)#V<7Sw(IWezZ(zycU|reAxwx4$%j0ss8sM!L{jOFgQ_
z;0Z&WwQ@MoH504Wf-0dhqaA`lY|I|3jb^Hdpg~%+L6o=$9Rm~`CPEVp74@(`T`TsD
zaz~{Ot_U~nK2}x~<~vr!*!HsDoDkUE)~;%tbdNaQLY^i0RK*oi5X9Na(4RkUEw@1^
zwApLQwAy|Tbzl&gEXh@4XixgT-j5*M)}H!~7*jUJ$KLPB-SEcd!~{MOF&3vJl#D_)
z#v!fwC>D^)b7x~1b0Q5GTPc9F`HgewRi0E|R<^=3!wX8sGTPbirYd70uqn8zEI`G|
zvqQ%M$mo5t07J!SGDvr(kWvIN1P)r<Ju=>p7#_isj0sxnK-;n`@`U-+7XVfNghjsY
zu}8X4Z4IvNAX*Y<xLjCIhY0wb!rPJwD+QW-s~4S@A^VE?y4yY2XUpAQI=Ic)nunB=
z>;#7V&aeYlWw<LdxD#WsiMBt4bTLcAK^o&YV+b_1rewr-6&zJ#t}3%IGgIjWGsX!q
zX`j2=)su8@hPUV#uF)$itA84o=KxdDiaEekgYil~_WLW}aO>K3T7x_Cg_Vu8vJ6ig
zWO;2vmB748<eHR3Ptpc?CB_#pwgsEsdH5_Ls8T2VE;U$6mmoaeD(4Ox&=43?H9P~v
zOlL%7wCBsNH5HZ_bo132t~(6A1`ZYRpDjNV&5DSteZ}U%TOuWl)f}YUOLhX-gfmra
z+;T;58qxuYZ~nO=bY8zvaMo1{$TkS;Shh&yZj}L<85{3kHBOl^@VU(28}-{*WKeQo
zG52SavNT5-m<)8ytI$+`5#JI3WWYgCF|gEU|1MAtTMR!_+F?oY4DB}fuZ4FWM7FrF
zn(pjR(=m=SUXnz_R(X6DJ;?wvG^VHZ4YtM!?F^{gV*_cfD<*>9dtkH<Z|vff%Ry>w
z;oHTPG=BOpwR@|Y>lXA>CW+J2ZH9AkPDa0D9Le16A$81jgu}i1NX?II0aN6PVJLu(
z2WM_Ffbc^|(R2mO-UKerBm7M$xp1EX+|)+m3-hn(q*-bse3jZ8{w#KlKblIF^Tn{r
z#dI`6IKs)`(f9m@FT4DKfBrA8zfxNAdcSWDFt2ikzvj=s@6p5K<F}5G(e+Y$Gp%Do
z^w#>yv$V)yF)UUd(HdvohewL4VX6t=6yY{1ZhnUF2O+56o6Y3LwuF3*mxz<UYT{Pi
zB<A+igXM;M{U(l0q;9rS2N9QkawY&Z{^bYvtomvooBjQC(?deCakMeBf9mQ@!m8~f
zA>}#nR3KB=A%Kh_H5BkVDzKWl=C=E08$@9J85Tq7uL1f6WULP%Wl(Z)?Z2+^*@)k;
zGEL=Gy|XUe6Ze5IQpF_zaz+E3J&zq@*a?{;$k=K7_yorc!)AfCojw0Vb#Y)AQuixm
zjHZ>$6NxO<w_^3nM2DIz`dWC1d>em4vX`_7yc*oxOG`NBaS>@_#?R~~c=t;^M}Sc-
zJpv4yK-MhuR&G<%%+ag^wOt#rJV%E|VD@lM4E7=ubheSkzYSn%<9@m=Upncg2a$X2
zF76*N?gsA*=(_^{1uT{!nE(@kl*)xEsV3H!?F5~|ivKvKV`i;JhJr(Q+V$6$e;01}
z=9gdj_s1P`fO(bkjJ^KOedKO}J|eBGEvNo+KP}-%*+sZW@~rIzp7l+?dRv(dq80#S
z>G@r1IfGzjjHoDGO=#`&3|AL~)yb+oYWEewThvr%b^B~$Z+Av*vmRwmaWe4OeWh(@
z0#fdCDZ6ps3{bK-x2xx7M_V&J%T^e>r2!HRpo=d7Og2UW`shqGI7JQS<JkbI(;(wQ
zsEtR3_J&OkPW)BsMV}AvZ%95dcOh-CX2q>w&9T2GsTio}PhMnpJp)Yum`<?7a~WV<
z$`N5SPbi+gNHE&Nk;6P2rXD~6GZ>>TB=m!Qri>|B2=XB3={|d1+L8#VoYE%61K43e
zxQ}n#NlS0Lkb0Lk(k>G=BjKh@^M#^o9s4YmnQ;k%ZR4vCm*oQ-M7FZBuH0m8B)>dq
zBL^6nJ$iK_AK*Hqmq*6EbQN9PmO2~iYN*qwR%37`k5iWPgm(iO0t+P|_x23;Z0AYL
z$pz{~p3I5*CQM|T(wCz-bAa)gngdKlOkT;yg--WZ9pNmBF7>%EDlTF%cc2>UK;+gr
z-vAg3+$sQ*-C|>_cp#Su$4rLWLKq9j-B4>)hK@4<p{A^75X!obMXgk>&X!p-p!bq)
zPD>e0(WOb>Ny#e!5+}$@O~5sfuqP#Xi0=8$)^9VjCC{<As1p78nYK9aTG>s4m_6@8
z+Ngfc;GkyK;AHJEV(%d3L5+=Z$)IV<xd4a!Y;{dNAz)<JW0PEBtq4d2Qg&Ye(G)39
z%5$b(W00@%kP)Clf>G)hS_4rI-rEEf5ZU9EQ92q@q~UQAlpa7_xkz{~*7y{uWu`Fn
zw?NF<2gRDqs;}ya(VY4`gaqLB69P;R1pfH?4uT_xX<@CG+UKAqyEcFT;#rKvhWT2_
zOC~<Vbs{c|>`;^K?(QI41|%1FlPw%VU8c1BkW$cYN+GK_PWfTjX0`^SbVx7(xSYTq
zLtoD>+e!~K?qLFppD$zJ!XYtck=KK%vGu}?2#{c4#~LD*XnZhv>noun_sczVfO(ZO
z-EH^2cnDR&CW6uyVO(6qVrB}BHWqA$oDVif=yA=P$_QkrX9|gQPqI@IOi}qn(_W6S
z&l2Z*-9Ot^5~<=2FiWxR?#Q=kV0899ued(nkfmRxMMbj`Hk+Ao5Y0KwA=lE}Z2NMa
zwNo9hz{tB~O*3+4pS`sFd-UC&JquK$Ev8-)<<BElOG3&e7a-=qQiGJu3ju_HL8H9`
zhchFRW@h4EWG~<xNz!4g2L2GKuJh~IbeDcaESbh1pwfn2aN_Q`v_l*(w-b&}u^eMM
zXU-9mHbn+8V#yAX|Eu4FJFz4r+^2sE!(ezoQ1M_@wn@fvUgo`aMiHMuzOJ;ev!BLy
z_u;O*rBg0PFXA&-rOM<O#^=0hoX>E?a#W|*U=f;u1Axr>);XxE(Fp(s_hTCt{Wk9F
zqy9_Cw0sHRa}%C(19+1u_LG(sI_3~`WIr_<sw5c)6n&ROqwZyUjV!F}6C|Y7$uJ#^
z56^$jmw(~X&wc+-9lc7@^18fz4lu8BT45IbbaI0I&Jv7^02qJ^r5CEmXorD><N93N
zbkg6tTk=_4dJvY-r%0V4LSX=8uhRsr`l%xOMg(gq=&bI=?>bZD9-tBJQffngAe;nh
zTgGhpF9kOBZ?$xCuPK<7++K;q*0i&n2Ba9zGVU>6Zs(X8k5hF@@bPi32D=8O{A&eH
z3X*gbEnvfWZQ6xGB8j2{h#DSY0vKBpwOWjHnNkMrFFe77AY@VuexQm>#(TfU+(r0p
z45NMvuC@%MAHz?WYN`Pc32!2#1Ugw+<Vl88O`yW+yPE8t0FGyJ{xwRd1V0n4OjYJ(
zlc4wDu6&GBE(tJ;m*Gv;dr|Er(bjW$8_dk*Z1!f9zk<AAcXxKv#)S)zOdy>AG&=y4
zF<g_oSnCsl0D@p1KJKLFkEZDs%r$a#AV7qKA}NLaCBXPQu9@RbWC?I-M(Ss3EYMEW
zHJlF85nQIbz1BO>#h-tbq~&#a`y60i#kBs$Kk>fqbUH?6JxIiDsK=lx=t7rEFBSq!
zSE(wX|8?(_yvgKw(v)QGMVMbsS~>NcxV(U;0!THppH(*jSAj58Jq3W>V;mwrz_#Mc
zzg{W8yb@4p+!F#s?pWyMYsffVYMeB9HqZtjyE>BCF*BH_0j9{HHjJeVfy3+=3y+*Z
zC}Nt8x=T2x)MdKg7?c7qMe4~Ci~z<##`JCmII_u@@mZfq*p+CN;NrF#smUnCFd&hJ
zZV*dQR2;Da1NcJf4@fw@e{!ip5?4qr#|H;#4H81;VruE-tU+cO0v7zyV~Zn18>Hv`
zT4W?X)MV)h$uu`{toSOtymnwaK{^)`kti<2zy@MFU@2D_`^~m9rA;?Y4tJE+hN|n<
zA|hdi)Ow*W08p0IM?W4!s+ubR7@Q!349m>A!9VU>N-VU=;2DmGY%%V%!_Q&tm)guu
zzSJJaCh0=t8{^c<-TvLQyuSRUsBI1~7=$^%ygbd!cOcGSD?tpVz)u?tJ!`Ze-Hp=>
ziu~hZE`VZZP8hM^OPtWm>*dr;r&KahS-H-?D*LSN@L9J{-m~6&N>{djt=(tTQ$W=2
zd946uR>Fznq1Bf&>`-UflRnY*oF!?+z}@S&0D?&G-jpKY1Lf+i8cgc;g+#(@<=zq-
z70=c0we+IX6CGeA0YCz7x&4Je64?z*iU}a1+bsYFx$Wh&1;V+%j7WE+Q%U={kK^p^
z<`U?^=6JAkXlg4;w)}TiMRsb1RZb=$5oOQ;kp_XE-Z@B<yKqrP2J+syH2@6NWV{(|
z&`Fe`12}7SYsAF-a4+KgSY=pVL;}o(L)u^nFpRrF_GR4E!}~=<^gn=<ICo&P+M(J@
z_EW045kG@JKEtv)P_c1L3}gU8vd!40v-lYU9gb}mfz@!4z|jFHYi;#c<F5bZUah^K
zuFnDHRm?Vs*v3SW^>&mjC6+c9rxsx7Zpj7N0_4l=FQmNu$AG0K=Jz7(FRf0q;xbb|
z(}#Nz`I<<-(r(v*)Ir=|4h&ze>`&zR(``3NBDaYI&x40*npX(D$hn$!FOWE`UMoE*
z&p6s<wPles?Cwx!&a}@}o(2$F1yx(j3ruxLsxd2!XuDsp)nP@d2>_yiVrsJq4e_+u
zxMU-+V%A_RTN(SQ77ZX`3<m`Z2LDnShTjg6F!=~MCndhHo$-#(jG8B)j1X_qMJ8dn
z4L1k)C8pl`7G7Vn7qNI|S02DXe)Dcxy7U0j_phhXGka=3r4puN2`MG!F2~1sZ7fG~
z&Y(-Et?%L>vb9TQhiTCk!}K%+1eDf6nX8EMzl1uUI-I1VwM7L$TY$9LQF*_C4J8zd
z0WyaHQ<(`<xaV<!nWlMd3S-x#y$lduOgI7I(_d~6+%Lb+0p?ZAG~MZW;-su9V23Dr
z_aIu+*x$v%7T}P}k*?N!>jO|`IsI8;R)4?O=a(bQvw@U#*D`@NB}E3OAeU<A8HiN_
zO!Va}0(}-}*7d(!+h6YYN*SM3CjytKqym>B*Ue6@DfwP0P)3`YwuXB2SXLQ9qqU9-
zUQe`OR&t4U)W4hSu^`F_zP?-ewGHMhu%Rjt*kZ<iYQ!qHWQr$}TwFaSRT_T>&kYj{
z>?*Lcj9^;F35*A8gS3ZO$I)#hpQNXNCSCFY)-$>IDblUL&akwMn|L&EmSyg-lVVZ1
z?L}HISDEn*|G|Gu-egN}z6>=PQs4vRP7Vop1T=Vn328lnn6KUN0SJK8R+DWMs<qv{
zG=#Up;u4<U=Si24F508CNC3uyhUGL}K+MlYI5Z640C33ts`xd6i8%%^Jq|y!9!fWq
zXb2<%9rrBj=#VtC*(hyHdjfZ9oV3{)JoqZe%Io^pIl#QS>BV|?H5WZp<UdbQ8jBH&
znCZgF1sTAYp0<8vPcpy$QiQJl?G>w`#893_34u%16veX4>dsw_Hrrk^8<aBd<+1j1
z05eNaUvAi68gQaB&GC&fD{aZ-+dr68Q3j(fCw{A?oFa8L+aLFGY(m``V?-C<<{Zp|
z%e}rA6O99*#@>$ijkOcclH)Ft45b}gZ;@wAW1v%x#p#&jQUMH;Ir2b9;RkQQJD2u!
z`6L~zOwtexR9AKjnM_>rXvq#UI66wJTfV;K<;8*u0NitKxhEQH$?oL-ncO}|Cy29I
zSb>My8o&<HMT_bC>xJ7WxXhzwQOt(W4r%;*X&*5$7aqDIWB-)7$p9J*-t02K6jTq#
zoB?1?(wz~q)rcQ7W4(nMm^v&x$LPeP=jrI+Dl?1Y@o)q@PYGrb`3SohpVS`UpZSX;
zaM~C!AD5RJ0i^S4@cgEW^??*;sXwY7?9W7eZbHkxD!&}R&hPj7i~G%CwcKZA@>&(?
z?fUnZBFaIG3Rhl?#`k*eUWD*ofOA&anZ%y~e07^<KhI7=oNH3vnaDAs)lF)!sbRnP
zn~gQ8tlekV^tn=%S-tW5=`po3%JWu!xldHQQODWyGCThDxWu^G+6u;`eAfDM#>G^w
zO<;|(csc*(pY47#M(fXSHZI&$K(e6|%<#tpx$eLYb3_uUISX$t1FFSENEkTOOTRJi
zmx)~Xvm*iOS3`|^FIAd6gF(;_M`;LmWm<*1IOTGJ++I33T&P?6oBJo7nE!FqyculF
z^3qZ|-bG$DoX*+eDV&fx+IUw?3w~!F%@wE-FYHXx$|0N!sG8uoOB@R?H@C4kv&eo~
zE2tuKuQG-v1Y;RYCdQ9o{$Kv?ubR_VjMiL&c@=<)#XTwKM$iBm+@USKAR>euE>kU4
zxuKpG7%b+(c179tWshbWB4)YTxUtv7z%|!!5ZJm)+JiH?FBQqARW}+rJN>F@*;$0`
zwA9cv!oSq;MtR{N+@wxH2{ZX@{_HU;zULb{KjURj_sFA5Y%)0X<a*WVa&M&KFO}K+
z@d45r#dl;PANCZJG-h8l5ZB;a-`50Ig<%jXmAlT?R5A3k1|}?3b8urg0~Dtb%`AU3
zT;!Vo)q%%BC+1xa2^s?<tV}OFhWFIY8jd1ffkd)Pk%w!AzuQ<5U|Q2XWLD-e#|&ch
zD%D!Av|dShd}BUi;Z;&G?(ZYYegHLD?@^@hKfeY~w%Y&*NHH>K+OwK30<C?S0W=>M
zG?E!LI#Bx^z-4)D6~4$FWFUYy8NI@&_M5_`Yzc>vog18_t-Vp&#uzIpChe*Eqc(yn
zGP)|eR3oFmOE&T@vttJsRL!xW<U{7K&0tL(WOl;iKcR%Xzh85Jc@^`Ig?*qPHO3N9
z<)D4(u8fKk@F?Km41GfzS6Ww80I6Ah-um^f_082FOC%YmS{`)-K@=Uz`?xq62f);B
z$0D&!22|zWgIRtRqh^HRrCuZr2DTLv&wN%iMlOc~J5yr*ZmJVgDb+UtUev)EE?ZT)
z+n-B?QL?pLzH>g_r=RC<M(aYdFbStw#_2Jr0moEZW;;<idY(2CR&9MaWrK04bv9#i
z$xBA}W2{r?!Q@reVtfoflMv>VGDy?t5&N8IS{EnqtU5wO|KTd1g%lwp9L5s17bT8_
z&5<elkKnGn+TXIAWVDfJ49}Qn2j^MoM)OAx1AbuLP=`%#LxMq~%GN6EC^Wv`<|&t^
zZq{)UTDU~K*}1p1L#BewWl-gM4>2*f?xeMib+{`d?-+8EEsUrJ!?>WJE`VSKfV#Ci
zO6w5F?yNId(!hnz479s+XwaARLxaD0nMrl#c9^0fBWPLRD%(hV%tC7lBh?&W<^c05
zNF|t--7ZsRU;&MlMHNH=>z_JWolD`KWKt`*&pbDfWo146dO5=T(oH)}?7|>#<+4t(
z!%{UA0%-eFp$&BfB(%ZJR@Y~Z{59dMC7PF0&)rM<DOEM8hK-mgir-Rf8?V$R6_R1J
zt)0(zk6NSrm>=2^tiA%$vvXI&vJutN_b4yQD1Ew@G!qh_Nir@8IlyRs9AIimCudcz
zK!&&0F335vvK$5I0N{Db+?$N$F&9~zF3fGLWE!^NzB`7|d%Ds}dmB@DlpUvKIdKsL
zoS}k5L3ib&{hhRWZo}-20+{G)VUo(Fvp5${Z;UM^0N7i0aR}KRI1oIzkvbP3#oXc<
zF>+Y0m=7-0n5Nme@pX^(EWzpK7QD%hpw8<<(wYv?P5=b6ds51vIt(z9Hur~VYk!pX
zFRj2xA0A_2?6>b3SRx!jL75gGW6~epnj!r$DJj8)pgUKPX&=pp;iai~<Ja%`qu;v+
zP6n1pI?(V82)BoDqce~|+WjE>W*?A`K7K1UfMPoc{k{=3#w476wrK+>Hiuc8MNM>U
zKl-NjU7ii!YGIU9>@bC}p0>@LWh;p~YCBE+xgNM#19c|vVHXggY5uuP<Fbth==m)+
z%HeBe?zQoF{hP&pIPnvsd=^1cPrOMrAq<eVXN>qa0V8ZNWh|QZmwJq5KiA`PYJ7ba
zMYB$gs{>1O?2WUJW0gz!1pv(k<IwG&c|s9YsItl6Kf2zIK#;fBaZK_G&n}`6;VNIH
zqk%dL05cjMOHD>q9i<qJj1yY}n`$gg^k|C6F_!*LRah5ltI<tJF%Vo@P?Jp%D5?3Y
zrP<supYy24*MNbc0RR(V?rm?wU75))A(8N%Zpe6ss;?!~xdrKE<Mwfa&?cATU=jr?
z?m1TC+{~60QVh#7$;O`J7%D9e`8vA4&0ljH%&T1k@Jzf7J?%pS99+b-8H%KK12PW>
zW10>v)U3m?1#eoGion%sBmEwx=S+SxLDP116+nZjFUo0yC?}#t{o)2{R_*(}nI6U{
zHh&VQL0UXRgY7($@(Qqcsd1Ynh#B0x&#y#stDmttZNK_Z<@abuvz5g*V!wMoRlogK
z)QJIX1>E_BHPr}JX>TFcaV^>t<yu|MR|A-Y0;lfq{$$oX-*6h#tnt|!E01Y`=(_v@
zUhI=(?&?4@a{(p;G=`6}q#XQFIk(Mt7y!%yq#K!5v^elaTLh$arbRm1+l3T^v@9g7
z3hp#8vH|vfN&>4S9Nx>kXx>8$55sqW3xk&rZtbOIq;_eqB4l#CpH6Pkc*5+;K94bZ
z0b0S7aqU;q9>-@5uGBBwy_?RRJ1^Uc23f~I8_X^O$kt%&U)vt0l_Pl0t%jP@nF%qD
zrt&IwF6)=6b=hBEBXr5bUYSs0B*>Wl@4kP{Rb;Pv37mWgDJdR093J72p#yA8ZU8Jd
zsLr_gpzReWXaQxmV6!p1LA~AhpZ856iZcIaM6kU@_{l`qH*Ya9kl<KMoQezMt?dNE
zHec>4TlY9&bwa8K&+s(0qn_hcxL?zGlT)*6Dzmz;wnGHhTz)>i_wKbD%h<WFEkaU&
zrk*~H5!)hrNc1tlA_l?6D)*>?2cn?XpR8uIj#(dkCMtlk@wO4Dd4^I(X@^fo<KQ;~
zC^MiJ@HS!<ju-+76sSN+Z=2wd*YTj)SfmK`c9Owi8P-c%gOskASq?yrU2opSv@zJ>
zT3So~fkB&lE8TQwYn(3M7$R+qyFud?Wmg8rzKlT2Js8nfw$7t2p6ZCU(7-L?s>W3-
zq8f0|W7b$O`k803Kx)9z%@e4s+UJ*a5ZUO~5lW}Ka`aA<Q6^Q)N437oG+B%q_QDRN
znDZN30>&{;woJ&l+=V)Dg!C>vp>yp3r%~=BkJy!UQ<KTgLf2$SC}b9Mq(3DfMC~gH
zCS%^rGmT?E#`sRq_X(1^wc)UU{cnfo+~==3z`V)@Fq#~GU%%V`^~aE4_J?}_m}BFi
zcvd+Vn>M*+RIb5ATxa3ieCON7%Udy1K36%2I@n%vyt3y=JBsx_Q+Cr#h*IgEmZ*q#
zQ17Qmef9v2M^6n$MKv8>jNlq}skb#o#q$MVc;J)wF@urFWv*)0qpfa3kq{~*@Q+0b
zar*#!=wszH%i1bHcEaQX7$nbBDrYPLwb}2Qp_NvYd3OmE{JtorWNq^+xKI47kC}}|
zA-YB4v}*yNk~UC8s6Xr27pbNyRar6I8}JY$xNyQ2!w7#~qLE+5fBn%efr~KgFTnN!
z8Adh)#&S*IMRxQ2Bt5iooEC4BiNd|o0-KoIgLJU7gS=md`>@HZ%G=8@#i@NvZ;j@;
z%hCVv2*Y!13IPTIkRurT+ZT|f90!pdJ$n~t#MA+`l#p1s^-t1GX2E#b%I3(k^v(WC
zx^eq<+S){9zsIuJUc`L_6O6$+0!G*Oh6=bG&=#eG0|6iizU(o$lo`_LLB`1a02tn*
z{A6@{hK&bcb$oKHgqJM<3@*;IDPs=i0CNUyd8tdo<KrJ%?yRKa;jOfLw37~SRO%R9
z%M!g)$n{9s6co^iRFOQaWG(K7WYA>Fv_WbQC?{lfyPQXDHZ>DUXGz_wg*Vk|p#}k;
z>1-jAh6<*ex0eX16A9DjH@ccjvbri{3$E7hu*e~UClz6MSiQ<`@w+vkjB5>+@oXma
zH6TQ(RA1)L{4{HR3<QX~nr#Srb|x4#njV$Ad$WIS1Q=G}YI9?CX0@BZAx4~^ZGdD^
z^4wrjsaAjH-H3JL*#J!S)$|h#Kn%twz|pF=IbqC|qO5Rnf3=(LLW<bj9wF;7fygF{
zsa2TNb!B-a-MxJ?U4D4kKwQ|vaw5G;SsybKT>ZeRiK5&RCvY*w88Nr-rv3!!``4FK
zYZ*>8NPgLt^cDNb!9_zvsm{PY>pi_PUZb(UwRe)Xcke)o*+^X&`^RDm>=aL)1+xT|
z*)k*;+?ht%OU$z;`_L#S8SagEB!UwIGkG$nSVs)W@o*P*Ssa+w?(BchU;O^&l5VW;
z`{&CXU|!{1BOUt3`@NOBt>c@UJBQn8cd(ZRNItX#mDvJ3UYLkX(u;C9g<{2tTb2e)
zZDyN7yXN$(;bMFGo?-*=a%}IPQqwfbK2^_ItozJ0J#)H|VPkT#oVoZyC<`oE=bSt=
zz{gy3<7XSKW+StvrMi5k9Y_ZK@|v|X=O(CuMm6g3GKuRnG8s^Y`l}wBS|v8C9;?f-
z&o*BMY?V-0*>?)SkZ97T5h0;=zqP|K!w75uOqO0GvDo#<vRvK`Y>IaoJ<A}mz#d{A
zX7G$e+=&hq;~!K>JBa4L1-ry$cn$ZEUD=!_%uS!}$}3B&>CVmP)A7+hBK+Z%t5g7t
z10e`GBc<-gygG1Z=A(=gr9MgoU7TNhymO4`dt_9Gny!u1F_XJHk~B;jttMF?ZPp{l
znxvE1S2oLOb9psgyZK^T-&{={MD}-%sTPEbHp=9&(S0<w2lwbM#vwGv2DhfFW(@?~
zHsf1(h7K;!IYM_x9ShKc^l~`b!B`Gxw|acGmG|rSxfj{1ocZthrtcrMdXpa~XS6@q
zO*eLKq`hMVp24fkUM8~ZKde@9Jy?rcusJ@)c8Ts02*rO{_PJ-PYU46|?eRdJDS+SW
zKmO)_9&v8mWqsWo0rFq{o1*CAua1WI%mO(&el}V@G7s=DUe9LWSqxY=9Ay2kYOC?*
zjXxtE-nOb=ksX0{g=T+CKg@e<W-M|Wh;k*<0^1``Dyesx&#C>9kwJZlw$F@VW88dR
zqP{$aBv28V7!Csg6b5i+Z=q@|;4;R=JQhvecZN4%UNHsogYa6QEz3Ub<rZU{LCU2U
zI*g1MYYA_%L&T)eG)0$u(r&7XxNcj>`?U@!=H?4mHP2d)&Jjom%b;pFV7b*CWCV~l
zU%Ez6!n?f5km-1M{jN4m=iCxfySPJ^cg<{1CfKPy4VBxmDmH^ZQ<7W7aT*<f=Z)*v
z(n24pXmI@VlqzlXqXYF;7ai*&0F&941>$su;2AkQZd&ufYH&^23d|P7TUox;mN;kT
zaJ&nfEL7%*w;7)t{o7YbR$jNa&n1{oU}^`W-M_HDwDDDUjtA-H?#*=PXgi&oZl->F
z6<iK<$A%tyDHoKhfQs74h{(OvE!;CD{=_zAq=642WUQ9)2f@_^x2=ojo?{;?j;`5V
zwjl`F83W;Ju$tqxyG(~&@ulA3*|NNkE{!#e*j`<JVpd|yoT=C7<C(pxymDQR->T0u
zA5BGAj|Y{RIX)?fvJ<=6p{Tj$86nGvT^P3g!MLYlw_^-L>L~LTh1<LRF4AZjmR;#;
z;1K1T{UuajJjX@fxlYih$eP>&XT#VS8z2j?lskMCq`@mKmxk;<s(XwvI_b63E&?uh
zH<26?-dswjl7k~DskGDj+D5wl;<M@It?TL1<p&VZ$be4c>Lnq%GgZvJys;b<V(!#t
zCbjhXyHJN6!JDlA2CB&bZcx=tb`d47jbyPWSzc8kCY^=dFPvP$R>S8mu5YC$pTC;U
zuOp2=&Wc$WfT^Iwf&-5UoC7e3Qg(5j$Z+pTZK?KVN6>D%B<mDTCA9_kaU3zm(?y&E
zvwtTIarn%1h@+<u@BZIs?56wj(i~tufocEWzVV;_oB!djf8CS4(bAiDk9X7cofp!D
z%}wM$q6iMxO8UfOrOM+*8`_mW+P}teoievOoiWl~88!Kx_h%wkgwC_lla%|4>g?WK
ztSRS955z3U$e7Me%{~7z#niBK>efcVGt|3kQl_mn4fl)Lh7;#I`#l=B>OmuNQK+N7
zD187-@gO@LGZP%S)9bozr!d-7W;Pc#pPErkJ?(48kq42?peZ8Bo1D=5{YE@1)f&|{
z;`ZIqK+*{%<&cJ`3KO$!-epzfegtDOvLELgK}7~YKsgg|@Kl+xOgrI`XT-D+(SMR|
zU+lu0tdll(xRyQR00D%aRLD$x{``gX_|>P=;@VPLU0Ty|%WQzwy#g_9e5v-+OXKb9
z*xh<{U8mcTkoRy#%uzalch>@sHQroG!@UC>d5q5<$*+~t0Dfi~h)v}4iLXq#!>dey
zxw^8F9(&^P^scwP3nzI33=W~zMqBow)|xN{4*lUM*?6{s7Beg>_>zWxFpdNrSN9QI
z7-)F{v1e~`C+)x{I~XDJ!5E%zgVB$^^lrL;ew_o%Cpg!~t%E<)?`-_9`_nt=+T9n@
zCAe#@^<hM0ig7GvzMNdSUe}15?SWT7l^g8}F4V}9Q70opLBNcQBj;Ei3^6{QItvFH
zP!@Eys=H}}sH7#_o7D%W730m;tK#TG-50wp|Ljkh#OB?!&Nd*oS(cS>mi;LRyq&1X
z`$9ccBs=vl0K&(#A@OApyEkB;YFz+Bbk%{afLqvm42a^D)t~IKmr*+a82FG-+e4CC
z1TI#N{T0c^fX&p70vuK5>NPj!o7&CW<3PJ0LT{4UsJD=AY(WCJ(oM@C$cu<qAt9$4
zX#vv23B1qNR@T#b9PsrEAO6MkIq&|ww1U%KIn*KJ8psn6EpSrIRUCQj1lt^67_`YX
z1~nPd_0yYd@r}q$wh5=27jY;VR8}2&Xz9f{bO35VmA}A>{p7$^q($E$xX?x2SKzAr
zvFD#nA9?Dd=?xD&lIZ7kd@x9N!6a^NTmsOTn|C-w>%>ZTu;2hvnp|KObzG-|08`Ek
zkWS|&7|a9YXWK>k9ANUl_~F0&({mSPpQSm#l>PmxK23+?|9z>y{>_8I!9(;Wd-3)Q
z>D<zVw9;KsJ|j3Z!Y|%zE18Ho;V@j_26ft&8F#T<cCkTtJge-f7P?N`>kuI}#k3J+
zZrDt{N|C;&PfH$EowHTV{v_Hd+FZP|42&vOju#?A<IOq?i;N&tHB>Y|Uv@mLx&3G(
z%RD=*%?c7}Q1P^l@S0J6Nf2JGBn}ZdzvOBhedpx>i|CWLIiw=ip;zvUOP(%)24L9l
z#p%Pqw18Iz6mJ{3ek;;cU;!|K3iYEH_QS?f@|xMXFv=w1L4eulPoqALRGz??PxBE2
zEE#MmLqD$ZyAChI->?lY$+bz^T3SdK;IVArfj>?wxqC=t4?py9+J?i+2R{6>>Aj!v
z1@Nkb_ZJ8@Ieyt^aQ_%8BzjsF>74*!Y?v__M#QOqS{%Zo3u>~(N1%4bQOcBLru$?N
z2rN{y#Y2*#V<|Xr1>^_&1g8Y_XBBOF<kIEz!55xRHx6#6OTAUZ$n2zZ7q&1K3+XDx
zb4<0F>`f?xV<&Z?rRAsEj4?7iysQgV8S}i|Ik=VX9Nt0-wL_%O!T25T{K?Xo`KLL+
ze1dD>?|tJx8NUBJzV?sz2W$W9AZ^Roe{Ss>xVdww-Rq>~ZXX;D9UsluuT3Cg<(r|<
zToFCn63({szJ|CF8^{S_rqVBg2)=C#z}TlKR&63-4x*?gR^`nQ)P}A}1x0y{h8n?W
z*mdZMWBu2i_ZBrW&4?01lCY<(zF;T~4E#=U+xB_3HM~DyHcK=r(E2X|pT9;MV^O$-
z;e|HxveiRBGp~)gbbv98wfPGpaju0RQmqFwA^VAzAY@^;5!7U$u~CuN7>w(jIspr>
z$w=)>4!`kmF<#R}rZi$lpc3oQRzg*nBp84I9Zp2QSqi<yY)KqIAb)ymnjW}-?8At7
zZyyt=t$e<i5R-@Qz3VeRGyTNR|8)AdAN!f~-nV@Yl1W06A-n^@js`=WBeSu(?u(lp
zHVG?Sq?z&`3CoFE_#Y=jZbMCW1cKj3Tepz>av#SvYtmvxunc3=v^HXo(I&>_;W^R>
z7VI~VkzQNqr?)@wXu7s{E$sjp*0(m(Cer)gMewDLT~J{>K@88+yAJ9yN@{J671WBX
z0<HmYXNK$Z=uUcW`!O7pyqmVK<Cy1B_ox5GpZ$rsfj<^D-sb>Q4aTeeI6OT52P-RE
ze{4KDcyxEXlb+drGOezy=`8OyTrB(Ga1`w;Y%vnqG*G(QOtjLx&FWXs_y(=#{>mI%
zfRa(b2F*ixL6Gz#%NHTE8e)r@oyZ?{qTwy|%coUu%~obO@XIwEc+uJJ#Cl9|6Rj<v
zT0?_h%_3gZX!oc^q99efvmK?h&ZH6tk{S3?3K)|5S#WQRV#N^nnB|tJDU}oy&DZ=m
zaD@bmF|O5L8iOE;x<70aHbHrO)HYtSGv`AUrV-$g!W;{?J7o}Zdy6=5SC>tY?q!1O
z{4+OXJO|)m)cF_&6P;nCEW-c~y4>QAV8y`4F-~b*zyl+=E5GnSJ3R<@-gCEL8zHc9
zbRm(9c*-jzjQ-k6`hw5>y!3&e`?>UiU;6p<wl};TwhtKTk+W?3_N}yv=zk<<@sXt4
zF|#F8cUP|=>pvxi>Cp(uFQF!TXdP$7^r0rhQObzN(Wo%HbAXH1fvu^`BWWCY89&li
zA~6@~VF0KbP_wOVUDo#jq%7hP-HyBP0BboiQ_k`gVqm4xp|(SvgVqQV%RzeX_G9?I
zo%Z+k5W6!Oo=kSXvbi_r-{t`GAFu|#>-T*3@z?#0-}2uqFKqq9-pL@{KDe14yZRA$
zlANRmFHY0O@<v)&T-K(gzBc|7IU(Iso~uH+{IE>5sN^!U28Jpn^XQp_6guginnW`-
zJsn%a?rr<aIB27}wI5jZWF~M{EYBZ_l5>?lW?3Sr#VwamYW0&H8WR0z4tN$qoUj|T
z7{M)oYlOBYLzWuUh^Pe)@`fNV)!n2;PRxy|4;XemPeU3kh!teMZg^9b$3ZOPX_I7Q
zW|}M`c=p{;C%0dQYHNpQ!?%{HxLNN!D_A+<RF<`jQi1L!kT|h7dBCI?`{ec)s($nw
zvy535aAh1L;7HoRXGjn63mvg}eZ(sqU;GVwt@P~qb~?8Q=^RqUGESV76b$ct0e&A`
z|KdVg-CRrW`Rreo9{ccP=|fL_2mrH`5H1N9;FGioBfDls!^|<D>dcvqTI$s}LMlR>
z$^h=li<g$t!UiN5xJ;kifl7~d7=lY}gHbsl&DcmY)<s@;@~pFSWflMqZAc<KER&J_
zyf!+)5dla{)`Jv7&>G^ufdCiJ;n1=pe4h?RX@L6%h<Di?-$}>#YxNnmKS{T);*R0&
z`+xXbe(H8}b<e-mG-*B>pWq1m#=r2D@9!-4|JJl~2qPq70~R;oRdy+DuAW2ArWNEa
z;yDuGx*4v;!JRe@^CGOGO3kS$wg<o7_}M@pjK4w23Mg5}MAF38&-k!aYO8GE4m!+d
z;(jNdR$6#4#qYkUGXXDy8N^z2CDVRWV|Y=Gr{p1J)U>qOdk}qyN3+Q(N-(z7$#hNB
z?TO&@Lwc#F+)14VssNFFHX`n$m;of7_P#r41h_`rN5G>2j}i4~N6brY%r}~A;AQ=_
zzj9A8+6S0LfMxEV1BYFwb_`&_#H<1pT%-3G?J}~_gk)p=w6rn=AhaB5S_lNRH_%Z2
zj@@?p>Z6P49RQjxKn7{WLDoT-AAp^NAb=D~3qyd_!Et)=>eX}$XR_dYuXL`ro~~?O
z*10dV+n8!HBsZqfH)ereWXhSQOay~;xI9d&pZic+SZ}A{Q#)zl@dJQWPq8Odr_s(s
z6_%ynGh3zJ6|71LiK;EwLvhc9DQ6)098C6bBmvZJ$kse!IjDCR&J6#oJxKrN9K7k)
zSCqqy+hW2)*y!v)*kP<&2l>lZ2MONJ2sG`bJI@>;o59{+`=P(`<Nxn^EzLjM2+Spz
z*I*?~Z|(e_?T5O5U^F@TG^!j1!|RHVk*^6Cs&m_>mch$Km_oyy^XA4k`j7XS7}#3W
zaY7kQ&uRgu*0y=wkW8*EXjGKXR%FiIxW!D&tMSNc9WNjgcU2g%&iGh?ozX_?S*|yi
z8K6`iXx!s%4XohCFJn}U-GPUCHiMplW^*jO<M}PTj!f#QUv;}Jo?^x`%_Rq?^yN`C
zFpT#6>~Nax1Xg*rzlhq0%FO4TeQ-1hFg9ZA<&oLv7dMKik8V;45R!r4!9-PPuHB`W
zMG;&Y_qa2LgO2J`J$5fY@u(PxKC)koWal|azi=>4x4RFd-}k0pksihxwFEB21>8&<
z1tc9x4s8TCKD_!!y8OtwG{Q6O14t4>CT1}JkX?i}9|DRasLiVyt1=mh(msUr1zSva
za~<AfeMnM!se_X!CFh~K*d-Rgifgo;sP6`rHqB&MAeRBhc-Jui4(<(U3kQ+GBm*GB
z5d=I$ZU0t!9(I{A+^u=WlcnWH3=WTmUOZ720GwoymY4(+&ul+~^JcD3e&mP#>Kt4e
zo2mLT2bg<}%&WM*c0ByT_Tka`F~n@@Xel;}bA(uKbiy{bEp={!AZ|rBuit9Id`j&U
z5`r}>N~l4K8@3SG@=NJTU0?aVQI@wi>WI6n+`_5XGEiNU5qVM6{HkFheyGX2#-suh
zrwKDNkddU6>xHoXaD^4Eit7!uvEalQ#qit;(6mDR_jt~9IVvzzUbF6M^|;gxuIIAh
z^T44ImmW}3+RHT)MAp`ejjWaBbL+&@h^T(__i~?kk+JOr9S050tJfLxB_$&b;&Q}t
zK;yfJKe$g`XEIDS3Q_>uG(Zx}AKkl}HlF&a^!~T}^7H_`so)*f;StN!T){?i(&Y({
zcm!&wE~4hUut%P3L(ZXkYr1G)D`tXi^maN@HhzQD-yK78xdo}^VVq2ffXjsAoZ-QV
za-4Lj2$si=QjFnW1gWwH%Q!|*W2XKgZrg<|=&8dO)8|}zAT2{mYQsTc2Y|9Og_j#*
zUlgdSxv=yF46f|pDCyoZ@|G>Zelr5o__5tId1eP7H2A$U3w8dr8i6^$e1gm2+kWe>
z`^=@({T+kx$@{J!9KY#>z5O)Cv8xjVWLZWD+oZ-A77}ZmXpl3PG<@?rjSC()Q=S=S
zJ8pC06CKD@)h<v~5DdHDqC2EQFoG2zQPgjKqf;Q!ueel!pMdPFM&<j9x=#DQMiZ9T
zApF*na90rRnE@y@+d5VH>}o{kW${Z1w9C=s85mTl#l#z;_z)4aNRND?axFea6<CEd
zCACfrN^Vn=Q7B!k)PU-?0|q}+zaQf)VrEhhB@d++qKP@iNyI;((PqW$3uM<zTNy!b
zA~1)_)o`&aM1=9hr4!zyz)UO6%k)n^Mggj{=PBJ0+As#u-&^aZAKZC9b)NX?^xNO|
zo^%;vz)2sihRO+Oqku|E3nw7T^5k0N+%Q}oa>F6L*8np7uHpA12tu_b<e}#opQV!w
zR9iUKczOr}{2DSnpTqaN0JJfj2ZaAa^P=<3Ce;{iM{up4zzv({*)XF4$EJ-S6+UzX
z)z*^$l*#e!^j<{xLnoM?L00r<21l^PEaDhxN?HI%ym!&Jey4}j|0hU>2{j?w(tY7>
z8hq?J5@bTfc4_j0Z;|=i86z+Um{+(g{@(k3-AaF<{hxQcYyaKRVEVa(7Y65thX}6f
z9;BUzQ`*O-Ii`*m8*D-X%)QT}`S6(rRAYvXP~;N;wUL8pI`MI$Q)qG(%aS?1XtaY^
zqYpv-NO;61Hi&9enQ3@~SQg?NpRx;R#nl_<Jc}C5fy~5I_8TYU?5&7){cN-U)?5;o
zW_cuXHDz}#mfvJtYXM3l)<!OiR0rO-Dl~Nd-cwDD0i9ZqiMs(q43ktc4m_I93dKY1
zmepGh<jqm_XBvQAh`cQm1H4*tDbh!v5^kiJ5X~1SBV=(EmrFpgo}vTNDcV?B9F6zS
zB^ax^NJl&tf+bP=#h23U;QwvdFur&1skHyGz4Q&g>{q5YEG}VKsD`q+lb0Vr2O;qA
z{5n)bf14*(4q;>;_RUbQqbs>bWIz#{p%=f(#hCUJ{6XxP>>Q-l2I9*$@mvo^`H{JH
z+x9URYhzrPnSkDLsB;<Fnb($4=OJQf&~n&r64LMwSoR|ir5}Cq$#m`bIBg-_%}?*#
zOrx!>v<bBtd)xvzE+L~p7x~thr>t{0NQ*b&#rDFDG`e~_^>)yY{w9)SB7O4wb<Yu)
z1I)`Tj_>-CFa4~g-qQbcbeMj_-Mzz&W8@thLX;k1108fH=`I%WU8I@V$J!oaBUmCv
z9YkxCC<vbe3^#&2fAm|>Dk&>!`so`cqeE_Gc4<}VZg(&$(RkiGoE%{M=dh|52r|NN
zFC!|Ad}empZ^;x&z!j<t1Dl{?QC8J%wIK*$t}OQ_OHniPP^Y(7Ga7m(C&Kv|a=}gM
z#Cy~zwJE8UIVdAg^_63hN5;m1FSi8wklFZJ_e<Xlyld%YM#2fbResh1MqE6L^0*ZZ
ztpl*nab^sJ9}avO95{zoM&vUqVBwNsF+%}h;$CyrHT#IIg8<2Jc(apDeO1#D0Bzk~
zZl@pFeIZ?b@`LH?Kl_)bbr{@Fx&%6u2elakH@gHGF<2nl=OLFtf;qWEH)mtC%o*6m
zPdyEnKS|)U(eNl`sL5{brtU)<X<-fbZvaM);K4=Jmn0nynKLm2PD(UA7~?OYyv;+{
zeh^r{4Z926t4~gl^%+u08y=qhx4bR=i=X+q)Y&^q12`z~Oq|XEr9;Mg(p8#(=^l3@
zWJ?CPEWU6vttlv#o_V8TZxqlwvw-JcH3D;hxyM5I$6xlo-v|$t-@9{o^xhjcuBAKU
zLwKLSfDaYe0D+VzOYL-szx{P2w*!|of=A7CWCVxA(IF$3Zrjs0lGGb<rEsD4YXOQN
z68XtY$oRn}6h3qchpeVZbCHR2=J=eq7f=*wLN*FlxfE{P+7uX4SJxoG)i_2u<x&$^
z@^cjzZ;2?QUrzk37kT8$=hEh=Lxi`&xl=&NQtD33%)9+$Tz^Ooc1yD^Q{Qop9Gq)u
z%n;h2&q}{+bdK^Ujg>mQ(Iob^0Mrhpc!~b_QpwHN_nEOGUC$|GOiHmeLXR*nNKOEj
z3{w@6F#B@LQkUlo(-ZX2a__lpQwlI6G~o08(qj67=RTg^^XQw?7j58Z;vrmmas3GD
zsTZ!jnBMaC2NCYVaRKAC0!X-o2>-zV5(Om}5PP3@c@0L&8iAmBaj}sw>7?;4o;gIW
zF9dw1OEmmLq8O~$crn0>&+>l(3{+!l00@K%jQTjE1zv5P4sxV1y9U&2V@Opac)xX4
zp#HjY1xYF)6fHs8VVz88!-~=O4$9S`V=zYbrYmW@d<>auTX8|W1$zetWj}+N6*d3L
z5tswa%sTi-U-n<Usk^fDk8eEx;^$n)VM<qVpx7;Jj=>T(#%e!}R+iGV4-p>=ZvqdL
z0nRR`G&4aKRmP3cCR(Vm>B>c;a1Y6b<$bbOaJoiAo%%b$vggcjSrjy-v8JjC$SMS<
z=A?~}u3y~fxpu#G)}tzjUx1VWU?zS}R~-xdw`MKENE(l$vKc%OX$;EdWm`@x62uwZ
zSfdBlHZz1bC(hJxh1-v5s7GMt5ig)_oUsKYME3IX#MQ@mL@7c2V{l@OoND{yp&qn(
zv?42f$ab=rV}pE-L$Se_@ZfEWLYxsXX~Wte=|z);O0?`W5&l~RNE~ARPGQ@bA%PQ#
za{!D>Hv(J%lY_qB$D(Opf#+m;9!Va*<KO&H+W6&Pn%=pLv@59jgWE5ppZ?e{rLX<;
zN75P)qDAQjl7Bjf=f*IkKMSM#@zBSbM7^v{Eq!$i4TLR4r&0_6l*k*FE?{=fV<)X)
zv*9RbN;qT0wor#ByA|8Ch@5HXp(b0x@yq%R0M!}53ym=`_&nH$1h$g)4grSvZyAyN
zZAe-??5qtpF&?P40O^ZyFa#I|XD$L%#t5R`M;fRFfRm(0YTbskHqBr68i6^$R3q~b
zf7ScGe0Q|}e_#Ej$GT7N?WSkpVY0VOzeJexxd?Ig#mNdxea!I6l-@jF47;HN&zUy3
zw6;Q|v4EA-&o_)%9TU54`%`!9#EGf|(i}6;6rv*fU^?jR5im1@Mx^cHEN3pF7C_K=
zt`P5<!#4@#4Dw6$gA;tUFr!SSo@&Gle&n*$MmJv1E<nqx<uN}>o9{$Sgsc#4?}~$!
zJyQd2jDrBV`B+6*OYNEmr@J;zhF0poI>OQ;FcfzLn)yA3VP&ksLBYzfK??Pu3d~0M
zBGZyETQQ;Htkh{XbyiLW7W;1C=j~UQ!VS5u+h$`yi5qQg7&y2`8N2CYA6-$`HGG_J
zOYmsGW&JD;Z4S6}O3*Vrau}9I5ayrX-${S=$Nyz|&xMDPbL=F22nk^KaHRNq?{25h
z-2!vQQ<v4A{nWZl8_Z&wT-{F#$32um^SP<rSTUDFOLg$OIYQG^GrWD6`j5iagE*A-
z1F*{=xnvt0o;>>HRY0{H4=MwAZ6IJ0aV<R3cuEaJ`v{C75fz!4Z;#RwPuxn^y4`dP
z8%)1@4A*BIbOyse-K&`|tqbRZ4#0%ob8MFm8SCO2Y&flBh%L&Pj+T`e;<Cu*pYjOI
z0mg^o?|;!({-zfXcE0<$YtN^T9c-uP5tV$j43841!<N9o&||T+fQ1F_MZPFBK0+ln
zr3%*FI;k=s$(YFIU`9wI3m#nO{$%F0WCXEc7?vHuHZ{Z7Ug|YN=bS2Wqlr#97aS#!
zP+GYnPyq>SpV9VHFA;IS-T;uecNSGdJV<p&q$=Ws`)a#|H?Q1Rw<kc(=!_bc<5hBV
zoS1rVMBvpHN)Bp7?u!xsGdkDkZRv>f=#2r%27aSr)n6G~&9bVo!7S2<Pe=YOzI$6U
z7m!D#v|X2Cf~-`Hh%XEM6l1Q+-G1U_28d*E5n|ZD#!85_8Exlv685H(g3YBZAF9RN
z#ae5f_Y(kMb423E594Il$DaEr0*m{|&o#xd$F219k3W?@>zBU+0h=(2Ln3J*`|t_V
z&pZSWK#sA=^9ahkOPL1>Du9HPcT^$6_95f|h!HHi0I?yW=@;<V+C*b6BMIjoQpwyv
zxfA64n;>2VqRyyuoVr_R*&^y9NU@Cojsa{!$cUCYkW`jo$9dudA51THSJM&>AY=c#
zOvjJC-o{xmT>#HYZwY)QyvLvd#BrTCLV$;p0sKe{P-vUX0UxwHp7Yl|Mqmyw7?{8J
zMPK<<FC6ZC_fyxOPyc>@Ki%wrJA$aZh?TUC&9ebt{xzKH*M~IIhbtzzn-;xgc<K;$
zo_Sni;aP$g8F(eR*oKzQUBn<rtrIF4IjdJt4P*7n4+4zleFiXRlTRZz&2^u@Wk8vc
z6oOd!?+vnC7ghh@?b0?D02pzK+l@w=6p*>$x?j<yka7Y<X8L8c5&yXQFi_Djh;<qv
zHfckhx3}m^^{7AMMLDPL;FO7YW9(^cmYk_VHG@J>yQG(oNeqoFY);(xv#Gr7Rs}n%
zzhxlfw`^Ev74clG1y<>5vLamsuvPClh|_>U-eOfFweobqyvSnCa;5^+qkpW0`W)lT
z4mjBP5~jwCXWW}=*v@Nd)4iA9C&TpbpSzZBAH5}A0jRW)PDv6Z#&6t)R~U~kMy|4x
z7m$<&!IUj{mvtr}=zPf$hRG*+_!k&CKEDLD7fy|2T_;O$8n^_t70+@()_~Rs=dduQ
zWdo|M3k(;4YD`iM)N#}{0Av{TKZR}R-1=&I==^#bo<J)BuQBdV0vLg1u@A2?NHHtD
zURom90ATpO#61t1aD?_dKWY2?rTIW#AouUveGf2y=ZnAU4L3$R|JPI3o<)@M-E<wC
z3RYNxp9#|MFQv;{%jv?#Qd(bK1UJ)#!4Hd$x>~03R{9sb6Kq7#Tu%<=wqXl@Hv%NX
zq4HVS8$V?3+jg>(l=57@to2z_GvEB~oyBj#>GNIrqxeLjVa*lp#E*A5t;xhFZVV%I
z@Z+<Q?{S~eeCM8XeL;QME<58xFDG73o3ivI8*K(DCvfpV5O(iOru_~k7DrJzO2~ko
zX};HM_OB??+8h}c*sdaVl$r1{yg~yBG7B!JL|-!?Yr>6SM<H4hEZ+eo2)?k&Q83S~
z6<A9TnZR2U4Wyj#XJGKkOj02jYO#55qdhSnRXJW)R-t*@3V6A}ocFQJu+2EY=sDOa
zXz(6TT_l?i^+mja0mkUxbcAEY57W;*buIn+cj9zPNHI_(YwUp}X@Wz)nE$^8O+o7x
zV(njqB(n>Z89{~~WQyFxM}Wm-6>6;WcyHklmk}cE5ga*Pho>0pqxx-#-{IBSf^mP`
zI~I^IV>H(v-e8W0`!S>+9)-LL@3#jxmech;{EodoMY<TaYLWI9sKI)8Ujmqrne4!R
z#OVE=9bu{Y<gTF-TuYL!c=OjiN8r8(n9<_oJD-30+1674mz#{h$7bncw`?shr-v@A
zrUx#qq)Pyp6{^c<i-8Wdi^WDzp&pirEKOoDYGMqHrDC*|S16J~==^IfW6a`!(KIKT
zD#>3WG8N0@9DS&!yf$Yz3sDJjTdT&bMB|g|3Yu-P9yEZilDa}*u)Y+oJRgYVAZkfH
z9RyJrlW;a(4pLT^QPKdC5L!a@rsjHCK@5uxt7(UcE@n{5)w)FGq@lFe%Z@(B*a#$O
z07tuOu=EiO>^B$T(sqroOR;4R<98q19QbH;TU*VJBL=_;e=r#Q-J4@z7QhhRUdkZB
zb!e$BZ7lg`_8Qs}EOo!3aIw*tL=*YS_}hs2!1y2UW$Jo?3gn6?Q)%$cIYU8=vMHJe
z9PJj=aE+(k^m9+XkUsY<Tj??l<hZ{@^%qpjs;`gxyU1ULWSQL!80>E%>(@5c%OUf+
zm}A5V<6f>H%6=1W!z~!@;ZV~?H2#zUoT~_YY(WZQt~CZ%!o`4g7l>8LpEf14$AmUK
zwNu-Q!JEUhwF=t`&YM|UTH&F|U>GgAB|XPFJoANkg7-y8K8x7%{EUsS6D~P$REH}&
z`RrP5uI&5%+kFi%fAtG~{pY`M?b@%ofs^{K<B%mCbcD6JwcJe)UtCIWxB`H=uz_>D
zSK(;_38oJ{Ee(CLLI%J%SK}M1kcI*Z*#p-JN9fjr=ooR1`|Xnj7K%h=VyopW;!46$
zR8P7yi%LQDs<rJ@x@qsxHZeo$Ox;GrtLL2%WqO;NW&jlt=|=dB0@i?|Scj@Sla5HN
zqi&NgYF4nJRuj!Nj?rvJHV);9jM_#ttDy`ViVWA*<4hEsmY43cXciS^=soI++nNlk
zbtTA50p#5O7(4G>#FMB?4kEKsi=ZWlxbE}$AVCfg@;bAJmw%6>m+`K(!~@6ltd%2A
zF=B>;hf>QB9PlGEEYpLHpP6q<u`;}!zw<X8L}teaFo&1&Bft)$<C|fqoKkw>{i+Lu
z9n8gx2uS{!CtgUu?j4uFyy1)&h#WQ-7BMdsa{m}W0@q=HO!ok1!R(<8$V$$_T8K5H
z>g%L)1Ok5odkXF)kiecYLAK*5JjLWRLJ5Phn`{q&h1sB$by?jdKtM{uxipOazp$|a
zkhrUc${m@^qK-$#XTD1{2a-VW^xyD<7)x8dzE0|WQf-;}M;n3r8eqnq(VyPAg%jx@
zI`5O%VIlRPe?5P0Ej@H`ExqCLX1aWC1NN6*>I+=(2aaZe(u?l5Ggh*_B}ksFXuXWf
zcWFlS#|W@V)=gp+7*U9ibF2QCmFR?31s6_y7Cfx(Lhv#%%@Tm_$n6XO5I*QJkHb(t
zrn&*8%^^88)IIfva|N}Ct%#BRD2&KE&^|dX<v}x|TU0$^e+dAKp)iTZiFGd65Yn2>
zfBj?+QHIJM6!YTq;6HnN0?4DGR$e(VG#bsxG++ulQy$l9(4*%jF_;V&L_gOM;KAP<
z4Pwl)%F?MDuZMJFAWF%_%ak-jB;h>-4-)<YE+Ms8&N2(KjQnNFtW0Tul1hMzXJ961
z*){wTX!JZYF^BZSu|;k&ua_Aurr`dk?UnSwXYZuXe#6D|NZ%zhp7|>8O1LVAu+DZv
zvN?fQR~yI1(2y?WCf1RHCnX_qHQ_r{Zws)EjG@A6$Ks_ICx4@7BP10DbJ{+$qboRc
zW*?iR^`*X4VeKv?7aoHQDTsT!190i#y-OP`Y_C1oXqK2h2H-LgbII(=D3{ycmfrmJ
z5+iWm0?eQL>@QupzH{f-9w7hFE{u^Aa6jZ;Ha3^i#f$6d^5v~`;lg^_#ChP$Fzzp6
zG3zK~rQRnNxw<Rw4v-ZY`%O6(%z4;EGzZ!C;=4UhOc3oR;FNi*mT`Qf#8im|wT+~x
zkjis8zor9Q4M<si<wUPuFLqzFi^MSi$U(s!7(zsG_;dPSx0@p~8`I;|QsvJ|2F=~;
z*+iO4xdIFO!NDqU#fuP$snDYuwt{w~$A&*v(pi*4gxuyfh&1mF3DlmkK80ix(qJ*r
zQ#uJQ*rbY}Qnh+bDd+K1fQa?Mz#xD#T)i`32$W0PvieKU)hYh{y8~LN#RyOW4c%*i
z#RJNy60_$ml|S1s=63-L)BgKsf(`+P_fg&K5)nE7?SsAa3s2up=ihZM^-pjzCBZ@x
z0>|9IAoRxsV5+xl9tc7lGah9u&P)=;81BVA1WL-$25DnV;`PZ>AhW;NcEXZNBw9xR
zm=)Mw)|UHe0?h>Xa<>Bq17=GGp!5;v)`PLXhdti|$j~mc$gIkw0jQ4vW6U7li%&~)
zZO*@G1nyga84riQc7GeE(E(fzxJeL{+s6Vrzuv=PVem4!vXQP_+DIEvkM*b@#^P)7
z{Bdyf1el<^IZndG4+9ii=$VLTL4mv9c(5QCabF=$uY6_B0!;lqHfJu)j|!;wgYkAG
zhSs85Ep^I0#&>LR)K+dKi-@*aM@5D;9<_^2Ikuoaw??Fv#9`JOCl_5nql}C>OQ?_4
zaAhFzewkQdb0OE)>>6I|TZInscfU6m>=KltO{m-IinBzex|+aLe&bJWPvHE!H=%X8
zj9KZE5mf#r;$2B7Hm?PEVxHrimypneHEws2l;P4yJ~zfyVkN-RB_fwl?DR`JD<)3E
z6yU^nN;m==aQ1o+0J3Msu&fho^bB#Kzm|UC>fQ9#N7mC@a5`pNZ34j1x~~3VT&QM4
zodOM4%b6FVt61Yc$}nT>S!C8AQ^?PjHpUhNr7W#J#=!8i98Ju7Pf7hzF_TxWKuS5p
zo~9ats&A>rz(mp&S*iM^p&;;)p+v#dT!iPS^F-OMS*^qQ$1(!<Ex?SX!~bS-d<3_{
z6Qp`E4rc`ba|H(d2l00S0hKG{Xt1#8sX}jmZF&;JOqGJj`mWD}iOKh>o18hT38ZW_
zGi-7#Nl4<T1pwona}iqK*@j`4<B_<lR*_lLxY>gEI|#ZnL2$dL(}p?a59Ux3T2Obs
z7b33qks#=2K4*lysIhEUT1-yBTRBtRu6ac+4P>r#Mj6piNBOAKT&rDjNkY@ZU!7>{
z?2x!tW+i{BQ7crLcwXLWuKtWM&OOnAS?qi1L)c*=KbY$DNzz<+AH)05KtZtL?V~Z=
zJe>3!39!^sE^YBPmoAhsSl})m#ux|L@=YQUoy4DLH-A^i740VcZeDhJmwL%Sa3U!}
z))tMWISp_R@Q>;&*hd<uj^AYir<B3RTt&y{;Kg<4c#!_x<JZ#V&we<qL5hL$%=#mV
zgJY-8s}4zLVGMKeJ0v#a<q$}Dj4~i;jE)&~&(kR>?XaV^LRGf6T<xy_hDU0!{s|8e
z!?P2fX34~yJZug2So$m=v_=5RlWD{v5OCBnV+!=jE{tcaqxq%#SAy9;*!_Hp|A!El
zClryfZq^_UUs{J-^5$w<Lt2;>1X%Uq58q;{3m!lOZilDi*;!yt?nP+|!8F1%BR9)+
zmYX?<wVWI?T*TA$dJG<W7KGxSvaf4un8|ZP#iKTn3lotwNhNkily|Rsw7J|sZe6Qd
zD_UwQ43ieTBHwxSE)2@rz|=GSsJ;9~S=ER-5wf=mkl3_}>gO2|(Y9}76`C(n>dMZe
zAB~A97;2vz(v1yg<K?v8ht-Vg>>lrvPodONK3C$ztzDU>`q1hYJ{LF}_c$nv)O+6p
zP{K<u$}@Wq$Af{9s2l6sY%H+F(2L94VuzG5HyI@az7u3jDq&V+efRe<2rv&IBfX_@
zpXI>QSbgXKJZSizP^zH%jM58LYmD-zI+<B6(55jYhL2slo8I`sM*94RR*?N-*VJFG
zI<~<xkPB`pn3<gXl;eWB_{h=W0gfd{hGh0vfX4QiwI$df0xU6rs=O(*6g-`ho^Tu^
zs@ynsZH6AeR@A2L1riLS^{FbOJ(lnMW1uB7F;lW}K1yw!OE5DVVdm?7DZ%`iH-6rQ
z>*G6bq8H7v42yIyK~N>Qp*5uUUf)<vs~hX7zlNhzp^s&Xh&C4&^svUIJArXsY!Hbc
zPA6w(4|d{`0jUCp!VQ(_Y2t=S0k;2U2;0ns`qxGk&CMW^8_44T5*&9fLe$PVHow@Y
zEx63gac9`|hT7fv84>C;8d!PE*{|f(&+gZ0sn5%!WLW`LmIhg;5Q1c;sq!*gic47<
z5aZM{!w?^JMJ&Jg_}Li<j@DXiZn_2(G3Nvr`^CUYr5OlRz*C-yHW7fVKD$?JMFvbu
zx9^gR15uQJ>Ih}$<|VykfU&Z|gNz)%RAc^U0Av!*7~nx-E#p2j2@8nIvC9M(UMH}O
zVH43iPr{slsKe6>IrG8DwV^FoP9J#kMtakQx27u~>`d9e;HoZ-o}?{9$UGh~PBwxX
z6G{^F2-9~>4bhHsXk(`SI|CwH5X+;=3;;1eYA5Dm=GiYa;&&MH0-Fr&t35>X^BQfg
zT^jY7K88{JsPBZ%0Jv5QFmnlJW;@J$y)OZ#+dg@BI%H;4#%LK2LXQ%9i*20WNLv@r
zr_J-5@R47_g5nwC=ANj0F(QBd=la<O1r>=gE=Po+u)_Y`c<FM=i`_A!?eZFCC(k!s
zT&~$))ph#*=9QZ7cH-}L3hRm!u-sDa?2>e`Y4AU5VdYMH|7QW_Ouj#JtEEI&AF4T(
zSdvHIg^U_N9Y}JG{+oe|m7LFjuWc=?BA3Gtg`jKx^qYaR4R@_x<2AFXX%01hl=x;O
zSOJq|c`!SU<rm4PQaVTkSQrJfvI9^fQ?knTR75!)jLJ1>BtXjm%mD=Nk;IaYRsuU^
zS+?>8#xm+NVA24_+Uej!!#W5nuMtGd4r8ab@~|#RFSOS{Qjz@yfWlP$1eqa!Lwcnt
zoM<LEPI+P6OZ$*Ke)jPj=}X>qCG{q|N+CmcW7%MQGV-ifKy8wl&#(XlM`j!%KiOh0
zjS*_V0B8ar$5DOYjH|B}|1*~J!3mPnE~H%?yF5@Zr=376h6VwboJk#C=X0<69PAYD
zlbw596EM4^&)OFE@wNLBU^@NId#5;)VuXdmOyZ>O<W~ACi-_2V*BF3h5!?;kl3V1m
zD5=osN8(|7--ymV78ha2v@Ay4i~v=(3q|x7!g*FXwf4w}mzCFqh$f`iZ{=De#znMR
zjcUB&Yc1;ACXT;ibGp}FDg@e#o~a{O=wn@)$qTLwJf$x4wyM(FUS*@9Dgw_)AemJm
zVB3LLgp)V<e3{%3FU9~_L!13FCZb@D6SV3%yTPyL&R6y(FIAdD2dE%jF|PS~JTHQ7
z<(hrYK|>$R(OUoKX)(9mOmK0%r0aL7M#b4ZrWPZQ6y*>w++8}n#t1}IANezKE(nQ6
z0LFx_CZ)7sq!4Jy7>@5em<+Ke6Nb&_L52E^bz^A8^1gx^icq_>nw~(A<Ac|B)4MK1
zI&g!27%rv8D?@C3ri7}#+x$z5hJCcVgJ8)^7X@fiV+pMJBin1LXmgSPz{z0(sx@r-
z+ej}nrg{vIECT!}sVImVDl&VA+MEHFpaDEpvxUE4G#j?h8D64VVDpa{f%_6*PP*;S
z<Ay_Aus9knSbC%I+y~^}Ryyy)19DZczfdj4KrDKeV1sC(XF6ri8dvGNs^Pa-QCk0K
zriwzeXUSWv$_CL>`SDBvrYs#bmU5}{^r2l&xN4N4VLB`PZ6k*<+<Ii3pwr1*Q_#f>
zr16W0dPRU$qycQ$Yy%=rwcm#`Qz+3wCx}+UsZFceNKbtD^C_NwXK<ok(#vf6+45&3
zMzy85z}9ycjh>Casd+dK##q?ZW_d-za0bG`Bkm8NuR%lx{ERYQo&dpYCQ_*pY*3rl
zwm_)@A{pLoe#~LO)OiI|+~wFi6y9N>Hj6l!kYqw?k!!NlWd4WGBS<A9yi>h48I98r
zlFX2{99%y@`u!jK&{OHs=DX4ZNPxM3M6$vK8B(Z@6BKF8G&%qaV%HcThCIwN>bG&?
z<gJ^xpmjJeX9!AX0v<gdUTg$HXy|m^RDiq4-u&ziQqFXj1X3-;?pRh1N+@I|ZBHoJ
zlB<q93T;Weml6zGRI@5Cv#9RtpYKb6!IrwH&CZpA6-f?d5uC~r^sjxS;$LJEKIvXP
zC5$pDJJC=Oj1dfTAhCm&vN*#-HVOiT=rOa%Shthe3WH9qL=#(Vb?ZG?b|Y5yH}mpm
ztcA_rvsY6l)JF94E2y-*W~?sCZBiOJu{TvwA*}{ZPCoTUFq|dKw2u@bY8+<tP5h|7
zI4Q|>2l0#cMOER6WyCvL8bsU(f?Cq(ez~*s#cGY_L_PVjXmqsAy{=;Pqt(10+LcBR
zX3L6kb_JY%4h$+IC_GY+90LrX9GT<H6vGYcq?>3=N|7ysDov_=-1n&l7=D(KKBN@d
zy37Tb&&&RS=OrzXVDp;PR_<v=KvDoE2|QJ7H2RxCU#c<W)Td;_^Ij}YjPC!G`kCm`
z0u!lxQj84LTDK04(vN-k#q_?<yquPhZ)^(hEqZH)de63q0p<eGg>f^qZ>rC9GC*68
z4pQ1V8mD#KJjU}3gp{;`e~FP&>tPJZOpXaK*w;^N@1+~VE>fDAU5(BNv@cOQqr=05
zuGdl-ssIOHQ@tsnZLDQmd#AacnEd(s{VBn~BZi0MNkziN%-lsx7tw=3zrVJLzjLVv
z?-XWwrGd}9P54KoaWxqiJ+C=&G@=$nvQRc7rcLWc5U5Q5E8;s#cx;WTwV9cVdd>ux
z#-eVn{osUEl+*ZT8Lozy*%;&{1q0}ZsbQ?#S&1S8M>d2<{UOzOH>9%gQflg|5v*VK
zoB@RaNA82S!1}=fv?nvDhzR=ysaH(OwBNlJzb*pi!&{?LZc7wu9Es{O^;>yGu{E$q
z+xc5=x7y`xCC&1iOehISql{bK-x}Bh7)`=+6`~W=$l6^?F1k(S$Gg>KX=BVqnN84l
zmxLlWncZhL7XnO?VoYt8r4|_UX^R;{TA?&EMsBb+z-NTqVPmG!r<MSs!~%~YhAN8Q
z$B*B*lm6W^?ezI?J&yoPoE3v)l$s1lS5&<bl;}zfyA|?_bzo;Xf&cKy3DWihe2xKT
zKmWv?^sC>w4rvKzzTli0WUvH4Aak<=%>;=r8Rs@22*Q(a8^<jVPN11!Jp@qYA>*+d
z;J1}71}ZV;O;b>{rG%1-TnwkhbHyufqnSN3|7s&}UrI1WD;Y#Z@z@BIRAjKbEG@6X
zb8J~@yW4G?(@oc8VhZVsRAg@G_cdn0i6j(Ru@ND4FR{i<r7`;o0v(b@rov8OGhr7o
zjQUC`b#%U6Z3^<I%^ad?UH}WN>_h<~BFu%O<~Z4_fKGX?2@lb-Fs?h+W@20GvYWtB
z2yzKz^;#=Wst62qo;MTj0OJfW!R<$1D}X!IVw=py)HodY_%wel?KY&$0OQ3Na8zSf
z1B|!Bk<dNo3@~hjg$4mf4kR)9-l|-NfsFo42ZsNPlYoH5r!C({F5j)-MO|l-OA}-)
zUZtrf^B5MTj0r=(zhlpgamwWg1iI}~#gKcb1MnaXM~*StOvd<q3U95SyLOP4aZ>DO
zBlq9o_N}yn7_@~JKmaFRc6&IgSU5nL1VBMO7}B~Zeg(^W2qy4HKlEJsoOgU)g8KkE
z$z)^zA-S6a@Msre)-}2|Lwp;du#*M2HshXNhhYeOhy4}%y@UO)D1L+{&<0Byuqc}h
zy~r4<0976Vsq7f>CNHtk@B5$bOMrnGJs|1jnH5a8r<_9EQ*^r=A(7q?3Hh*2Wqpvk
z0c%JpxhV0F!y*mhLN(QoGm%bo&5>2hFtf}=CnK7s<}r3TtBrUmR%#Fed&E^B)y*!=
z7%%Ors7$|X>8n4w#)F)ym#dUQ8YmodeUJ66=H#8)m9iRCxr{-OKE1R3-j29EIOv(?
z+RYU_WJ;L(Z<JBv)liFt0N@1G$5D0bkuq*3gj6oZKF-EC_^l1nVXcgR&0y~mmF#RT
zg=KZ1NwYzIvKmjr=^#O2LC>1nCzsE{=>ev8utBzhV*Ij8b8esYjsF?b;GNMLa@Hdd
zML+p{F2Sf6(=(a(n29x3O#Up(JHDHQqqeeqjsgFqPAY(wr6t)v!rj?(n!%{f)H1Yx
zSTGGGAp^e&j)gsd>+-*O>=x3=Z0MBA2iAw_v);6k2Kxu;06=mAAi!y3X$`7t<lML9
zm{jFq%nkrnS9d4r=bqb7pMPZ;$1p<*ayzJ`D@tByJ8edM9S)~y=MaaK^)@`ArRl$E
z#5WHypK)p(d{2h780(gK&NQkj@)X-<6FYxt1nx_K>2*3g6m)sCQ4dngAq}KpjA(ou
zA0MRg(S90rq0hyc6AO@HPM}bu4wx!3cVIMinJ-4Or|_==60N<n%+j{<8Z=s0Oxkd~
z!l#ND3Y{vb@Q-0MKV4#!GFwY*il`~jv5@yHkr@@5*(OdXaXDJ6yS$i;OHPpT(@w?1
zrMCtzQ*Q-G6`~n}L~P&ElK5m0YhzipGf#s<Kt<{OCnKUWAX?OWF*bGg4S|YqQ_W3X
zj%c*2L$l-&An5%JqRrx?z-p*Ps55Ys;kWAn0LJ{(YBvI}((r3QTnh1FvFWh;i>H|p
z_;3t36*r^)y1TD{#PgJS`j}$;w+zh6Stb`{I>(rypEeP?EZgsBGl1p<zb)v`qV*ZT
zNzb*rla=%%Pf`LoNoz0MO8qUQX<9s=o_P9ty7=&0({s;0nLh6=7obK%>VCS4Gl&zC
z$Sv&E<Nikakq=)_?|kqrX?=Vbk__hywnMa)$7u`!dSiSa;XC8r_6AVl;p~_WqxZ*5
z6@%CqwC4nl22zF5yG%xZQ=Jn~BDPKG`eR=k<1n}P&rpf_>wO6@{Z{*F>QWc6DZ5kz
zOUx%%<s2F~NSW`k&=_Dvu~?xkI{U@hV}XlPdgaS-UWRMC`dlGK9``e}W2LO7%;C>e
z;8Oh_aS+;1^>xu0K2w9#4B^a0R{KK)GKh7=zgU_PW0GsGJ|vVP!88EI%Qs#>^M2k>
z6&8^pO5qWNaTd~Il+AZ5%Hi`(fXTl&F|`7#Im!gTDyIsFZGcaH*2)mU585rPDPv}R
zF5_ask71jOZUyY{zH)Sd<8Z<8s_iu>*i5>7#I9qlDs5;6C>u3c3=9+`VffOCemg)i
z4U9=D<_sX>$#@V?4O~FSfKKkslr~r=87yWtmb%PrG89S_J3}{KfC^Q}3$&RC!>NV_
zn9zkw?jn@`Nx-S*;(=h+E8MLz+uc6Ay<j65wbMWO!1eTwH$0H8-h^avbSK@teVEp^
z2I*brp}v|x%YZT80n_*F0VJB$we;-WgY=>2_tO_#T15((1M~+8p4d3--=Ug=y3vgN
z0b_RHHHJFjovcjERC;q~^BV1q1Qu$76}6B5?X#tcF)uP(^S&nQ&6)T3{DY0aeF-p&
z!_hC&9g%VQeR3>7meCL=^C1G6=_FdntlI8E+{S`$QHtR?6G9vgmr+|07A-9YoJ_Y&
z9Z``&cLpeQB&$)XRajhwSzg7`2C=I-VycN66_om-Nc&kj8E<oQ$*ULo#~W)j->UYq
zO?n^&aRTZTvALr%lH{StSRf)}V^qUKV3<{Jo4Fy4%%Ti8y_#>Fv;`XKYlg#S@9YQF
zsfmjFs)8F0^5PoxRs$T*a}FFuV|+DJ8bV(eyf>CaF0WZx9i1QjQ2%{=C9UK#g3wzX
zyd@-;3@|d#8;B`2irrM*7ti1hx6gzstVk|RcjZudN#c=<9Dyi;EZrs}35S5^ft9qk
zxNW74BaSEZ3WE!B3!mE~NF1=aj9{M`@;GBi4lO3B#GY#tpm@S1p6#OJ1LW%8#V!B#
z$!!qyUK-v(5aq&3`oPmWX?5wrw8a=T?B6H1r|IUw34){nChev4WB>ly^p;=y_Ov#I
zO^4E)>^5W&7t=9-Wz<<r+sKE8bCQvQpXvH}dL?Wyc<&;5pDFs8@V1A;%({42ni!^z
zi8vW1!t_)zdA;#8v-#_uBXC~=%=GB^LyIsDj&Kf4kJ*u#(qcGHhetzz3?0Rg2He4=
zMg1@0W8_H|?z`0Sx&+gdMhqC-Bz|hpGhr531Xt^nvXTyrE)}9=q-z#L&IS&9)IcM1
z$W=@1DMv?gvucUem#;VKuMx)*Q`P`e<%SN5?yW8L7cG*j3<G$o1~$gA#sNbVlFq6>
z-uGyl13(5$J>rrF`>WqwWfp@uJ3BsJUY(5vuX#L(KuR4axJ6o$04Dn}@S23p(QO9Y
ztP)!#{Kv$M0FIb6-XZQ-{jhm4g71wtdT){nuSGUYw$3FO&8x<jz);$mqyHU%^1)^S
zNd8_s4201?JkE$JREZI+=*bllsAh%Z&M~98gaMOOH349x)uKQ^6_XAnvwL{B<q5`X
zfO;nYlM`e&=Q%JM6#xh0X6Vs1nY7b$P;LFQU%HddAsh25jPuXlI!dD!4g-XWZ3IL8
zi+j`b?;g9JzW7aR$Ph3PU@`w1^9bJBKEM&ntLgdx?}&flo^Am+m?eD?z|!jZ$(20w
zi2;@Z78>^%u0TP^4j_{@BVI-W<|?w;y;Oa_KLKVueWKS}N+<geyRl#v;p#X>M%LZE
z{j|S#kOoLQvAjfK8Tw(IE53kZUzy34AY<wt+GF_V31372gDScqIB#$t?b8TR{ZM8k
zgG;s@dYponZ)B_{@w3ln`d*nC3S^0{d0QfVlhe{{uFSgqhMmG$>QcNab`o}KiPff0
zgE1qe<=^0{?dmKT7LPcCKpDn$WAr`7n{g42dL(X6go7wIY%L)fo+;JYpzvQ2?jV*?
zp94d!b__7eV;Td}7!3o5nM)1nA}<~13QYB4(ptQ-)WUWNVEDU5rmNi!gc=<O8L=@9
zNyf^oC7{U8Y}Z45=4!C0$6cAjCPP<b#lwhzdzu&-@l9%BM&c>c!@!eFxyzuN8p%du
z?#r~nFvZN2ftYk_&go{{cEeyyVUnfvCqfcPhtMEg2gqClGkAi4XekQC_%S91w=HiW
z&)Gg=+Is2T53Z!iKKC)$!j&$-X_TIPYA4+}M1(%16Vq`)sv3_HJ$|}4Mwu=#O4G07
z&=E_6&-6m0Jw{m*sHntafhacgq2E2V1?Cqp0{11r{EjDn@y54*+Gjk5{77%Z!Fg#J
z3u_w-eS2p=-MMoo?d?GnU&Eq;D`S@n4Ck=&yosq!6m$K$bQshplPmSLL~b*4+=Xfu
zIH-WZSXiSLr(VvY>U!}wDz~v6_IOQzni2WIx%+)adCD{7fVo*jlq>R&j}hpWxo{_1
zR@V%{4BVLk(A=#?I~`!W&ieS`*=?GKbQVh+_~KxzYnEbucC1VFJ~x?g*Yj45CHoWO
z!8r~T$-ouTr;oe?kmM4oF8QAtVd^X&G9UUt_IeC8+PclkaEn#&5gFKAEnA$rRELf}
z;4_LDtjcj#s9stoU4;LeNA!yUH$XEs$=WE7DoPa&V2->-VDSvo!cZD1nFS2oDa}~y
zh!bmEV;Y$jRA4RGecFtHf$fCB&1Bkmuoz6~cosGp6U*FV(+v5>#990FpO`#8>p9<K
z1Y$}W+z+W^IPIr@{n*v?^!`>_$5UIao%DI1@o-u~eA|ty$a~gbm)kJ~k&?;4ofH6$
zQ%E)^OoIdOHMWxpE@K|$08mLV=IiIl>FR-=f6gOtUjht!zt&s&xr4>_+g4y=T*q3y
z1FmX&=P+HnzMT&Cj=(`}!Uzb1rP9RMIT~Z6jj@x&$;rqlQxf!X=XunzNCbhXVnIV{
z5FiC@67dR7wV*|Cp_yYBEesH_dPR`jfAuVg%9KqMmh~91%$tR!STif2G9nqoCIpz!
z%mlHFB#jMF5a8A*W9l1o%bk&3bepRgMITT1pe{SQ(2(4mh-(fs7))L21ePG_j6jhx
zSdpuw;@KJ;)X0daDHI}iDXXVycM8AH$Iw8PW9_5g)SqT)f|nVS(O3sQMs-GL>vvTj
z)o_fk7h7-4bHn@1?^Vo7J*{~Lf_v~K;U`3fn#{_r0EfnIN=e==IXB2a?`x5gl2nnt
zn7qAAQA2a3j+2ocgxUZG1eCQ;i@O*i0T!4zfh8HXHe8y)(BU@<I&?{<<P*;S1Sti8
zx?~orP>vFRTfiQjE^Z-Oe+qj}D{bM)^V?8=tzCvH3{nq(slz;DP>(TV2W>FSH`c{p
z*jxxeRGaY{|2lqL2W>@Jvi1c^Of9Qr{%Lju?puIa7>&ORX;^>r?VV*)8w{|8@j{D3
zahDe;9-QC~MT!=8r#OWo!QEYo7I%jbq!fps#hpU%P(pADh1>f#?%bXIydQREXU@!?
zInVrssH=4uH!}*{!#&r$>wB@AEco$Pmi8$xH)nMM9Y^A+E3B=s*|zsR7Nr}Y%<GT9
zA67j+G)C)>Kar~rrH_p|b&mBzF&Q*$2L6Y&8dQ<o<mbcFkeT|fI>Qt7F2%%f3p@ET
zar9ChvrOy1H5Ni1wz!;_ltg;{@K+<w`TJFj-YI>$tg}2zhf*_G=caEW;PoUo?$c>^
zz6uuTmO;s@hGSChT`7t}7N0HZXSd~f#fu4M8>jawqbW~W33>rC@78L6KVN<bYC<(!
z2{Jo5$08x|CGVkw5rPHvY=Ze6G_$WQU3u<|1VN%`bA=RK!T4Q?;!NqqzLZJ>3%NVd
z!+`8>#cX<)e7v<~7d$vIGK$u&c{LjG(Gj6whz2#DE3lcU6<-Yxs$qbcOiVg$lkz@R
zhq;0y6!ZIna(#k>zxppLV>Ri^_acM}YZ0#Wo#c+JUjh#Hn0i%EnI!P@Zpo+~a%_)}
z^7tf}E5#3Ve*Qhkoa_Z~bK@2MI&2C|`~iByXV@X#HE+_r?M!bfMPToRj%o<DjDh?a
zbPG6#Y@BeXn5)3^IronJ88~6)<I@xU<_ydU#RIB`d+`$+<IWz%GsqDZ3_Q;`YpmcY
z-HFZ_E)Bos3^W-`3?arhRPT7H6k`mUF5Z#QE^}@q1KiR$Z;~c=KqPMxhX+QzKSLV7
zPc#%^qUU5TvhtcB%zJ#!+Bg?~n>}<p06uNn6FLz(zN<+cKXXQ3-fGYrIvs-3V{ow)
zz4d0WhO3a0t}DFG&kIUc-zv^+k!h};)OXNM)`8|XC~ILjXYTbSv-kLPhdQ8=)Ew4&
zHvHx>2tIKW-UjAInT@2fqWsrw6wX1-VMl|_)Y@MjZ)I6=z?$G<Z}ClI<QW^lT$vu0
z6wNR2r?EWuDd}j;P+~gP0U8M!s~r8Op?>Z6j3%T`Q;<nlVSWFW2r{y)Mhy)L6{8-v
z!Skh3;WVY!9ogoKp|Wwpw)E()q|L6CKuHD2$!aJC5)1I?3RRw=&=HgeK<qP7GoF{P
zR{E5FeaFmwIcJs*yd;pmlH3V#K_0t^qzxT2NhtaLP;_-iRHcBP#89T}*CoK=4DY-7
zc^S>Im;@{ED#0J5S}zKY<e101plFldb%G1%{)K1Xssz8S8Ho!6R!!L}B)>X!NTL#|
z1I(}6HpVCSb$|4!kTB(kC*2CgOE0LnCB;5EZ?Ls(WMh;}XD3`+o93sI>V$2%_-w^`
zRu6|qCRmc`aojxS?mylq#J`!V`NSecQ&E)2oW$S!{GhI%E-!iMTV##N2cccU{_3`X
z!vCj|;AbU(%FDR=h&_ZS8l=A)%D9vp$LOy?deO3TZE*ucOb=&Z<S%ZHZ2$mQep+~C
zJHLPN6M?iChXmXxRP23|D9|&~e~;i(9WO-7qhKpj)^27F<m!Z@4AD4~IY#VKAhp6~
zu3>@%JzxEGXKYmD)q6SY39w}!lWQa-Su>A_VKrH_qhO>+PGwgR>FGam2ic<%HR(>z
z>dRD!(p)PA#j+h+Xh88Hv3ka};P)*sN*SxP*hA+?mUq7^=u6vYRHnbQTzkM+Xu#zf
z{3If!@O@6>r}9^+(lrF+StR#sFa>52FRzwYeeVj|(&$W{TPie@Bu}CO;ew2#9BHo^
zl@V$5n&b8Z11E9VSuCiV4&k#L#}gX49dxUQBIY|TD|gnuQe{zV3@|OqsCwa`&@eFb
zg8ml*RaZx9-3H$#p0dwx$04qaiwR2bcgMP7<>K0}-e9Bm!*4<C;aybqxntUv5k{uH
zePO_WnL}`OajB~!6}#n9fmK+=C&{0}y2Z6)+KJ(`(^7Wm1EES8_^wK&JuI!!auCdd
zqppee|9#&@A13?u34AP(K8~7@vX9*_v$?XYaV?dgcOYDfJG1{c#QW6HemVm5A0J-_
zafFo77hkJoiVOPAIn>25h|l_%4yavPvI62)m)-<6yh9^eue=}IVfe6W-V&;U8pVRA
zJI(Qo=PGz-v-<I=%NE!-pRAb;P3$DM*IN+?0w1yR6B&weJ$-NH`_vfv01m)7cL{x%
zrhm>WetCrmn7w0YJrya8c=?N&I7LJ@W415+$Dd^`@M2HR*u)<lz{xx4M|VrkB@Rn=
ze2QTyQ*<>oI^L_$^;_4$_R|=)f$yMW(^vw=EIgq`-UhQJzhy0u(<5fsju3N}gs(kZ
z80y90m5yZwk=pu87qB0xPW?pEY1BFI>K|D3LJKUgX=EcPkDZ(H_;JXJr7^W7Kp)Rd
zJ&^kDv{1E^L!W~rpHjN$pfn?T1jF&It;hay)B3qpcREuWg;QP8t*U^r(@K`6MM~T$
z709od2=$Iiez>K$4FBUB=V~<U&3Dxfpl89jpMNfS2CE-?nO-s1z-xC2(u-iW_mL6e
ztc2@n!zhl8s-BY+8?X6O0QvekEccBMRqJs`2m6p+2&W#9#MVR&VZAdgrD&1}{%E7D
zTDIEoNy27&SKyqrt3^pHpt&J4IyWuiSB&+RBpbrS#eOQU&K^XRm#JT%bX@AX9lpLb
z9@*((xV-VSaE86Pf?`t_vYW!|R^NvhocQAOvQI(OpW*XHdyLm~OI7+dgZ{mV%j8L$
zK^l3D`yV@3+|Vh7{KN@f=pOSg#~PsITPsJ8jZ!xyTXSb_sIPT?n+LK=)tr&KzD<=u
z4cX|MCLp^e!H9z4P$&laU8(aiiZ;(MlkVbU9hi*0!1|OaAtTUl3C^UN2Crzx>JtG~
zIw)6+QqO__=(7Hay`I9)oe%k@{VTj;%IwD&1_i!}*`A1-_ujcipUBkSs>%WdR!x!^
zQ~B1cC6)MNP!;C4FOsLmU!~^njd^pBg*Hd++kI6aZ75fZR}i%3&MFaFPU3C<rHxBi
zZQJ_U@s?s+{f9uDk;7o~XEkMcp!q)672KR(wQIfygXUD+WmJmj?a`MK+^+PW$S`5f
z%FJ)&-z0|~49d8jDrTA(bHC^q(Unb^*9Q56kv$D|=%nbB`tXDlSh~LTrOK6{R;mIv
z53PTyw%0e3a7>-~FT7(BqS@%bvtK9OL0iVk=~P|RL*D?2SoNsPQ_WA;cY;sly?I+l
zlbwP&>JQ#;UI#p%xJX?iq<r0+V8HC>kl%%~b!H#mj?MNU4p{w#H8d8Us#VJ{*_fg~
ztjEdK`j%%{d+tk@cn`czrp1{L`z51SAXcN}p+iWEU0Q+85ligY9M?~>z=G(@zdP^q
zZ$#YnEK5CZD{v=c^pl_W+!Y1}pIwPP)+4rEn)+taY=-ixeYB$XrHDk9>~Syq{?6Q9
zti#>$+@DT=?=BZ3a5Eg6h6Q+F(K69?6{1#u&V7h}MP%t8Dsj`|(+okOds&RNzNe>b
zM_w0|h~+UnqsI}cy#^cheqGv>%D4Ty|IOUAyPXpJ#@uA`n6P|_ouS0bf#PzO?abpJ
z;1!00btoNqx<7wDJPA;ft&7pn&-;PzZeN=`Uan5JIrYWwPoa|*at1U%SDw3QdulK!
zSacI&aT%~JZpLSKydbFdDQ8FMs*or5uyS$<+SM_uZJ>tch-MV}R8NN<*zJ5U<a3p@
z?U}0={mkU`PRhyts%N>THuGC|uPM!8w#S8{qBVhPm9ACaWQWH62M#??C8J@m`c0D>
zcNomS#LKqJ2)OK3@j)M<DBN!oXh0r5Ywl)XJi)|k*XM!~9c~Vn@476q@n^8lP065k
zk}-dp^B!=Il^e=LkHspA6pQ1}!eJUY{%@!({otKROn&0-gvtUhIpu$5@$7UE30xqO
zi_+dv36#ql4E6%v@4Q7jAHYa`*n3_{rVD(6{dDdYR6oa00CnjTFr92ODIC%d5m-DP
zT*UNmQWf?8%YN8B5g1x#-mOcj+GWJC6pXSZT@{bSSxxEpO0WKAB2ww*<4lwUJ2u#z
z&ghigehf0`{^~$n9aOO*>*wq}I$n}#l>=l|)V%W&kd*LR7Y}fRh8}-_ltQydW~tR1
zMv)bO2?IxfDUP0^xIUtK=MXi|St;-n_F)k)Vl5{GVtM)Dpk=}nGQFtKHWR2&vkb&a
zr>4KrT-@We>DwcoJm|<LhOmMuqq-yWLFs4Q$Ku?5Uz%%4|4iv+o6ZF2plzEe-r_r*
zjfLgbdatu`8ZA-LhOL-QzD=rFA1RXATe*U_xD}zVD6*j?v2w|aHXn|bu%_G)_Dx|Q
zeri^b_(&pm<{suM%1ckfSB1$G4=-;%h)`X;`yv4MdV(tF>8k5~kf$J=sHxIBq@dEG
zcF?~umBaa#=EC(H#+d|Tp~@ZGiYuaewC5j<!=22)c1`AZ^yD3D`E>!kiWe2>KEqec
zQ4`~3-up)jYo^uOwbh(m4fYOxfnJVWhty#hI6uDdme5OIJqZ9^+Exg*`@HvV&u=fP
z{ub^A_$;_9y>W<l^I04c&?=&kx?VilJ*$BY?Uuzvw;f6G74%&Uc;Hz62iOqhz(3Ke
zkub1(XOa^f<qdIxzMk`fC%f;-$(y6s0?2ggoSRO80RpcD$GLSOqx?*i`^1#>=2)F~
z_#KYS-OJZ|4FxZHymiK9`tjR*@LMgd4^$F3KkuD`pG@^&%s#61*2ZQfa!KJo{+(n9
zX*Xa~kKs4?R*}(g&C`sn@vb?dl6^mUz54x4{HphyuY|N=wH};*P>t<lkNbJ8X=V9W
zSO?Wz)uorFd$ukK3#-B=eE=8V=tR>n#w1*H-C(T5s_B@wR<m);jM17aP)5a96+Puf
z;4ALzhe0GjkP;5a!hK2M+fhd!@_FXwR*z&J+gIh1j|Jcg(?h0kr&Van{#x~qd-0^z
z6518|*-XN>obyKg<~EeIVv1i2G?G5u<AYbl1Gqfh1rQYf=~;&sy9r#o{nF88p(>rp
z{RfE4e-+ZtuO1Te*|4WW<Sd$jwy+*_NsTalcu*6%>8C|~qy7rE<RJZbZ7bs8Zl&r@
zQ?tr1CO?qBP1-Gznje1e7Ia=#5p_#|^#tyWCN~fAyg&AyMQYdcnd(FQQ?8;BCTac*
zRM$SzK3Gav`tZrlW8k#RdYf~mGZ(Ab+tuW~?2UTUO#}R`^g=tuw1amUjZ5xzSo&C1
z<_&}U{E!vaksp7H-gJMqMY*Do7<d+iNX&v0l|!m3b8ICN0EevXEod>GABv_ZOdB@D
zb?~@_n6(-^v^~j7_jXS7IW-HLG-Y|_pMAa#AIPan&KUJ^wjX?6_3ZKO_07swRQso>
z<YEDtQi)CHI{fpZA!*gNvl#k9CZziA>@rSs8~cil#YP!oFE%s@X_F1X5$x<S6x$k(
z415$#j^37+{`^nUD^<A)YT4{0gGC%lguvI?&?&9%@{($Qkaj1jr4=cXm^5CE3s#nr
zw~ZOc&AgafiY6O};CT1Ni?zrW$12POH@uaOgy(_91RJ+>&SNu|M4&I=^ZkbVHq|W{
zffsW$RJYR|d{9Bx-!HB5gQqol;%NK+L@U?>d3mtLJZ6zc<G=If==?W+mHmPh+wg&J
zFPNEso5_pgw?aieowQKF%f*P*)jlSbb9wqeHR~p$nnCrWmHK+MosExvtx}Im_DJ~+
z=hv)iN&BW~GLe*jmbDWDxaI&ROxW(%<FdGIEx;^su;o(=U3P-0Tnyim4~GzP+;t(e
z;;NKVj#ge%=0;+EV3=M!LFL0=rl_6T-u3su6t2lyu1-gYVEx)bh9z9h2SF^IWwH7L
ztUGg#g2y?@?#I5vH{Ssl57R?6`%RU98u^Wb1jDZVc?nzFzYeKQ6vQNJyY$tfM%ku0
z2#il6?2A{-B}&a%Se?zl;iwqSEwbMA7c}E9i%=|P<vDX@s3IR>W-~8kBauXrG^XsM
z>U;PA_-|NG>s1Dii*&A!n3VT_BivD~jneL61&G1eF!n|a%t0nkt62Uqojrj_%s9R^
zn*ruD7>NBjW>v<ri!M-<Fo(286bs{UW;%c9#_ukg8hEvlf(PI6e~DAJplhuAet*-1
zbM30-eTp4~n6LWYoLe7q@@$K2$!(?(<G-2+N~7F!qgV#A&EU~LOuZl4-*<}E9u49;
z%#q8aOy~Q_=fhpME(DtK39(D4ho9e(mn5pa7m&B%8rn2!(X`Fhs88&fGLBO*DKifD
zAh*LSIhmwWYI5=*IPN&3D-*TV?LycsiDd>k>o#n}Yr+bT1`cR&=4v<Dma5e4sS}%)
zPaQovoa2;X%IY>LAIt`S+AS)?A616?I6xNpQs>(1lr^g0964}#JGZ{bbGhU~G1|Ik
zYoOCA6e7Idejg-sN4b?{T$~cC9{JY+N9d8B`C^r2Rsd4^8ep3jQ!}bc$Em1t&IkFC
z38C!EKAUc?3KOq4f1e+OL$s@)4)W^7(u#K#_%oI`VH{l#g|;T%Jem`ND}eaL$iUa?
zyxl??45X0~i-DAZJVL2W*sX*_<q_Y2xMHJT9`tVf80ZY?Tt(w#kA}kzy({O6GP6FN
zeUB?9E6?}vEti#TiT1v$gpb=#YwkVo&zfGMLopAf^-|o;pN5frBWDw9*1^H9Hn90^
zkBi}%{keuhW%`{RiI?l?wdIx}{;WHn_9?oA)1-a77+7@D_*6t0ayynMxEx9?yl1e1
zEAqo&^IiT=Xu@ZFVVK`OUJ9vtygcU}2AbvG-FUK+u8!<%S5vAqVKDJ3NCGK#j?Lm^
zEpEI?n=8ln{W6d{-C%Kb>NExLRV>ALFhi-GKHndO96X`w_xH}8-P5`($vy!%_SqPb
zd@D9a<^?4C-2R8l&p7xdC|k2#oZ4SggLFX#RuEV?4yxlauA<NnhB%M#P6x(iWhyC;
zS~Qk6GXBgmyKnvWvt*ZsVUf7K_2od#b#J{<C*Gh)i#{|obqB57Yf-iO6V&rAwa6&M
z%<%TCupH^OdL=IO)0R}G+Zra4cvI6^u5$>9M)ACVbtAH9N4E1V@*5uZrCGuh7jg<V
zzD?Y(1)BrzQf=cee5Wn$3DR2jvq`k&HPv0fCfQC}yKa#HoN6uKmV$}3-q<_=<OD7X
zptVNvk_p!+H2!gkCCDOW0wCbLNpVqd%Qh{PIS@cieTA)?>`?4_rrc!5TMD&Dn$Xw)
zhJYh>WCt6Mrz`z?dVB<%*9cG;=3!skpQh!fy`%m_suW?NVbsA0k;)ov(Zy9avNS)B
z4xzP9Tx1xEBBxt80-3C&kG&$xl7JUB=<qa(Xvz5^PPKy^Ztr0BTqR`e%|quKPPCIT
zHlYz@E;hn+RQ6I}2>0uvD^}2K`)(VVZ#wZjshk0usK9Pra}sTF8Ks|iwU|gJ^2wyx
zQwtXPvB3TjB93=LuM@3s*4D%DR|Qm~UX-gQ-HqGvlF7-X93Q4g!QW|YRsO~3a-YSF
zT_B56-2-}W#(Xy)k5cg+lUsPT!<30KMX|!#Cag``=cJ~?=>U?##Tmj&#?G8#u4E=$
z*rq`-s+czg<M^5u<*@k?$BqVTH~F=s0-G`0TluA4Umcv5oE^b%$ezf4$be3~!l!5h
z$xghXX)>wGX{|D*>9YN^Y*>c+4J{nQgfBVy83XQ_{{RlT-uh~`VWt!!gi6Bq|NVb^
f17R;tXfGHwivRs^Wcw0?@~>1CH5D4=EW`f?&$gf4

literal 0
HcmV?d00001

diff --git a/images/data-schema.svg b/images/data-schema.svg
new file mode 100644
index 0000000..0c5e36f
--- /dev/null
+++ b/images/data-schema.svg
@@ -0,0 +1 @@
+<svg width="2210" height="1526" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" overflow="hidden"><g transform="translate(-1067 -605)"><g><path d="M1155.5 959.288C1155.5 940.075 1171.07 924.5 1190.29 924.5L2081.71 924.5C2100.92 924.5 2116.5 940.075 2116.5 959.288L2116.5 1926.71C2116.5 1945.92 2100.92 1961.5 2081.71 1961.5L1190.29 1961.5C1171.07 1961.5 1155.5 1945.92 1155.5 1926.71Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1532.34 1013)">Dataset</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1263 1081)">A specific dataset implementation </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1321.87 1147)">(e.g., Wikipedia 2023</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" 
writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1792.08 1147)">-</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1810.41 1147)">01 de)</text><path d="M1161.5 657.676C1161.5 631.621 1182.62 610.5 1208.68 610.5L2075.32 610.5C2101.38 610.5 2122.5 631.621 2122.5 657.676L2122.5 845.324C2122.5 871.379 2101.38 892.5 2075.32 892.5L1208.68 892.5C1182.62 892.5 1161.5 871.379 1161.5 845.324Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1435.65 709)">Dataset Source</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1306.97 777)">A common source for different </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1361.51 
843)">datasets (e.g., Wikipedia)</text><path d="M1208.5 1726.67C1208.5 1718.29 1215.29 1711.5 1223.67 1711.5L1608.33 1711.5C1616.71 1711.5 1623.5 1718.29 1623.5 1726.67L1623.5 1787.33C1623.5 1795.71 1616.71 1802.5 1608.33 1802.5L1223.67 1802.5C1215.29 1802.5 1208.5 1795.71 1208.5 1787.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1250.76 1777)">JSONL dataset</text><path d="M1648.5 1729.67C1648.5 1721.29 1655.29 1714.5 1663.67 1714.5L2053.33 1714.5C2061.71 1714.5 2068.5 1721.29 2068.5 1729.67L2068.5 1790.33C2068.5 1798.71 2061.71 1805.5 2053.33 1805.5L1663.67 1805.5C1655.29 1805.5 1648.5 1798.71 1648.5 1790.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1739.83 1780)">HF dataset</text><path d="M1258.5 1838.67C1258.5 1830.29 1265.29 1823.5 1273.67 1823.5L1696.33 1823.5C1704.71 1823.5 1711.5 1830.29 1711.5 1838.67L1711.5 1899.33C1711.5 1907.71 1704.71 1914.5 1696.33 1914.5L1273.67 1914.5C1265.29 1914.5 1258.5 1907.71 1258.5 1899.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" 
stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1316.74 1889)">Parquet dataset</text><rect x="1260.5" y="1217.5" width="318" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1299.75 1282)">Languages</text><rect x="1286.5" y="1329.5" width="250" height="89.9998" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1325.02 1394)">License</text><rect x="1300.5" y="1443.5" width="203" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" 
font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1334.33 1509)">Genre</text><rect x="1565.5" y="1329.5" width="390" height="89.9998" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1604.6 1394)">Quality issues</text><rect x="1610.5" y="1214.5" width="339" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1649.89 1279)">Availability</text><rect x="1523.5" y="1443.5" width="246" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1557.35 1509)">Citation</text><rect x="1790.5" y="1443.5" width="125" 
height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="8" stroke-dasharray="13.75 4.58333" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1824.93 1509)">…</text><path d="M2310.5 1485.84C2310.5 1460.25 2331.25 1439.5 2356.84 1439.5L3224.16 1439.5C3249.75 1439.5 3270.5 1460.25 3270.5 1485.84L3270.5 1670.16C3270.5 1695.75 3249.75 1716.5 3224.16 1716.5L2356.84 1716.5C2331.25 1716.5 2310.5 1695.75 2310.5 1670.16Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2562.18 1531)">Dataset Registry</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2503.17 1620)">List of all default datasets</text><path d="M1762.5 1839.67C1762.5 1831.29 1769.29 1824.5 1777.67 1824.5L1962.33 1824.5C1970.71 1824.5 1977.5 1831.29 1977.5 1839.67L1977.5 1900.33C1977.5 1908.71 1970.71 1915.5 1962.33 1915.5L1777.67 1915.5C1769.29 1915.5 1762.5 1908.71 1762.5 
1900.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="8" stroke-dasharray="13.75 4.58333" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="50" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1844.47 1888)">…</text><path d="M2309.5 1906.32C2309.5 1883.78 2327.78 1865.5 2350.32 1865.5L3229.68 1865.5C3252.23 1865.5 3270.5 1883.78 3270.5 1906.32L3270.5 2068.68C3270.5 2091.23 3252.23 2109.5 3229.68 2109.5L2350.32 2109.5C2327.78 2109.5 2309.5 2091.23 2309.5 2068.68Z" stroke="#000000" stroke-width="4.58333" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="8" stroke-dasharray="13.75 4.58333" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#7F7F7F" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2446.8 1955)">Custom Dataset Registry</text><text fill="#7F7F7F" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2537.62 2044)">List of custom datasets</text><path d="M2309.5 635.962C2309.5 621.347 2321.35 609.5 2335.96 609.5L3243.04 609.5C3257.65 609.5 3269.5 621.347 3269.5 635.962L3269.5 1314.04C3269.5 1328.65 3257.65 1340.5 3243.04 1340.5L2335.96 
1340.5C2321.35 1340.5 2309.5 1328.65 2309.5 1314.04Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2696.08 695)">Config</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2468.01 763)">A reproducible configuration </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2582.34 829)">of the final dataset</text><rect x="2786.5" y="985.5" width="315" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2889.89 1050)">Seed</text><rect x="2454.5" y="874.5" width="647" height="92.0001" 
stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2589.51 940)">Selected datasets</text><rect x="2454.5" y="984.5" width="315" height="92.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2505.82 1050)">Sampling</text><rect x="2940.5" y="1214.5" width="161" height="92.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="8" stroke-dasharray="13.75 4.58333" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2993.18 1280)">…</text><rect x="2454.5" y="1096.5" width="315" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New 
Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2498.7 1162)">Split sizes</text><rect x="1229.5" y="1564.5" width="390" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1287.5 1630)">Token count</text><rect x="1641.5" y="1564.5" width="390" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1715.57 1630)">Byte count</text><path d="M0-3.4375 91.2022-3.4375 91.2022 695.102 28.1814 695.102 28.1814 688.227 87.7647 688.227 84.3272 691.664 84.3272 0 87.7647 3.4375 0 3.4375ZM32.7647 705.414 5.2647 691.664 32.7647 677.914Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(-1 1.22465e-16 1.22465e-16 1 1160.76 751.5)"/><path d="M2116.5 1440.06 2216.61 1440.06 2216.61 1578.37 2213.18 1574.93 2286.94 1574.93 2286.94 1581.8 2209.74 1581.8 2209.74 1443.5 2213.18 1446.94 2116.5 1446.94ZM2282.35 1564.62 2309.85 1578.37 2282.35 
1592.12Z" fill="#000000" fill-rule="nonzero" fill-opacity="1"/><rect x="2786.5" y="1096.5" width="315" height="91.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2826.3 1162)">Data paths</text><path d="M1212.5 1993.5 2039 1993.5 2095.5 2050 2039 2106.5 1212.5 2106.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1323.52 2073)">Extract text from dataset</text><path d="M3.43748-0.0118472 3.87169 125.974-3.00327 125.997-3.43748 0.0118472ZM14.1683 121.355 0.513189 148.902-13.3315 121.45Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 -1 2789.5 1865.4)"/><path d="M2792.94 1340.46 2793.73 1416.98 2786.85 1417.05 2786.06 1340.54ZM2803.99 1412.29 2790.53 1439.93 2776.49 1412.58Z" fill="#000000" fill-rule="nonzero" fill-opacity="1"/><rect x="2454.5" y="1216.5" width="469" height="92.0001" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#7F7F7F" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New 
Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2508.33 1282)">Filter thresholds</text><path d="M0 0 0.000360892 31.2259" stroke="#000000" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd" transform="matrix(-1 0 0 1 1626.5 1961.5)"/></g></g></svg>
\ No newline at end of file
diff --git a/images/favicon-16x16.png b/images/favicon-16x16.png
new file mode 100644
index 0000000000000000000000000000000000000000..3bfdc3df918d5351bf39a875cd86df4467effb92
GIT binary patch
literal 656
zcmV;B0&o3^P)<h;3K|Lk000e1NJLTq000mG000mO1^@s6AM^iV00074Nkl<ZI8UY1
zTS${(7zgnG_pQ0Z*GT7V9`Jz7VH83q4Jozkq{5Q&B8uoHX<c+<3?e%S8EF&}Eh!Ni
zDTH2>U1X)yv|L+YbfIQaWJ}T6(z(?RZr}U8&5NQ>BP5@T7akto-^=qpJPei@%rfs|
zG4Q~X0loZa{^+2zy78vDtkdH9*CS$}=#G3_DK}o8%Eag1Z$o-+o;l5AI=N(^wsc3q
zq0))oWL+i<7hKqtm2leB`m}z@fWINJgbk5SNfM9dFL|h@bu*rn%zBU?W*kor{0$Pd
z5V^%wu_?-HnE8&!Lz{4YgAv_B>j5u6QdHJe(X+=ixDXKmE-lu%Lg-(mfJ_ywIJ2a9
zq|1UlheJzDL3m(TUWgl)9DgE<rf^xQ+~^Jj8sdPhdMOx#4rq7BqLtI*ZBmBIuUK6X
zw3=<d5D_qHt&wZhI!(`KRl<voj=f*|Y~@CdO5c9~a@h2Y_H1X>CpXC+Ri@itzq3A^
zM}7k%U6x{*Cb64PFw?#{sAE-_3-bsCLr}jQM`GI;26oCo3sY&_H~U(BW%IBIm`|>@
zv|0FY<cNX~jY$L{Km|&8!ZXZ-S97A*tDT}g+}q;AitzO9Q;koI3jyXzlld9tmD>d>
zK%f+a5JVyo%z8XH^J)h1Yz*pFdht;!xp=p!@w^B)w?BXLn!_blsZ7BM6cC~ZAwm@3
z;hXh?b2xD@$if~2*wSRhHMPypMS!K!VAy?Z-vI(vLI}@Bi<CwIJ|ZCKpTpFooA&qF
qc)2Q;1RNu0tKSSyF8YK2+037M0QC{4HJSkc0000<MNUMnLSTa9FD|tJ

literal 0
HcmV?d00001

diff --git a/images/favicon-32x32.png b/images/favicon-32x32.png
new file mode 100644
index 0000000000000000000000000000000000000000..d6c2687b5def4d491b67719a066c761869a25dc1
GIT binary patch
literal 1685
zcmV;G25R|<P)<h;3K|Lk000e1NJLTq001BW001Be1^@s6b9#F8000JDNkl<ZSV!%a
zdu&rx9LK-s-rl~uuHDAgc7hBTY={upG#Vl<GU5{<!GNGcQN(A|5b+ru0Wqiv_&`Jw
zA`<yylK>-t89^46D2&MlbYn6mG6w6)*4KL5`}W*zsF<;WLiop%oSu8my{Es=@Av(k
z2f=kN<T~5O{{UAOfyTrzq?;}9paq1;S3|zk0OfuoGtu*k>M<=urg5k%+)W<)Zg=3W
zANRj;|JL;z{wqXE0J31$E;w!XOm=jj&=|mz>pnsjBf*wGTKj#^o>4`>$^QZ|IKYfA
zM@-C@PrUW0NIa|;Y{(6<z7EaZJ-D~5OsT1<DSmkV{KMA_z)`w%ad}D8hc8YKfLQXN
zS#HIK${;pvuENxkl3M@q6DumVZP|9s0M0+)c+XzQzXpo|TABqS>%h3zJ7I=~)l;l{
z@7<k#KX58}%>W`3nB`n1-o#K1T!sd<8wi~x_?91ojn=uSVN0qtPFjquCCC0P#GpAK
zw%)u^ZC1({s{+_$kaQVG&L?nNtQI240fN-5q^V%#9SBEW$8+AMD+L)8KyJutBfi{R
z2vI=nffkJ7>^CZkCm%zgX9&PEKtiJs63~&5;E%tAoufay5{SV8o*?aHq~;}E^e?F`
zL}+aXT60_oJ>kPU4YRPjX%dA<r|(UGE2-UfSHsuyweO!_R9dNjow5uH(2<QVSdJ`&
zs=?|24AcgpYA8IgKg$UAaTa^HF*p?;3O1d?FmnXM?IA2_og3(C4Bdyd%j!}Q3<}UV
z44$*+$ib+j<+@lP-2gafwa{m~1!z)@s4=^dpO-|@9SM9WPr#>+2LR0o667nC!!`3z
zQ#0t6opLAKVZqQt#Qq7-_e{^A?qt&HqO<H&k}(v*n@yFm5^QR>V~24R8gmK|b-SRr
zSrCJ^F?sadQ}9=cmx@G6fD02B05b*PS?c_qR0#vMjqx1tXfExzfoK>wdWu1mf~{m6
zXm^bQpOK3m@35+v$}h*>^|b>*TsFXk$qxZpbkbC6dsU1+ugWB0AFH|cg_&Ou!k1qP
z2l-7n-@G23pBRB+3v0T&+nb-&m6LbK(sgE9U2EM%$$3ct?a(?dVNiz{%r=jr*v-3+
z2<~*ZyRgacFFo1Xs#>G*ph(o%$c$_!`^!PCSu*XN{<E-t<P8Pjy2LZawca@fFI^mh
zaK3JPeQ5Re<2QhFqVn|7ixP7{fWzC?EZ0nyS3xub2!ljYMMCD#*wTiH<3`h~03e7G
zl1UNm9i6x_HwR);K+)1#*z8&0I+Hk197lL+4vZWNImv6o$%z?bU#P75rwjuEv>({A
z(%~Di1cWM1_kF!+^h-$+VK#Bl=#WN#^j4;u5XW&)Xv;3!2lIh0C{{C~d=#hOy%lax
zR_{FLx$4=TS*t283!uEboZ$*?dZwk5|CkzLG({3DiAbXzN`Qryq*yZLQt6FSrG;pU
z!tkPK&G++@pKei|Qg#{)!6bh0HR5!k3aAMviBNNKT6F68_gl{V9paKXP*G9gOEa0;
zL{TD&qC%nWs&qqA`&+H|S=9-8$3^O;ArL}Y)j6n{HZZ(`lY9hgZy~Uy*}?O16iNGE
ze&mb#)t3cu=VLQ&oRB@O7OWwYra_+!{c`km)P5Q0sH)H~3*ciBEN_S)*JOf}W&#O|
z_}CUjt;eZvrot%rpI!0Q&Wo;b14dxdx=CD_b50=J;dRM$rXGP_Z*}kgj0Ek|aamTV
z_Y(CmiBKqlne{v-2N@`K1K1t~jY0(FV`=VWSgXGN;PVGBD+f>xX2X;<$A)L;74#+C
zg;ea1LVp74iKt7UP>7@;A{>rE^|xbobpSR|->+Fn(IhtAo`#O`&h2kM^5%?`08276
z9p;jos`G9iQ7jr&_AdZJL*1A8eNmvTZoD@NDkOnMAub{si6cEGW6=;hvKfLHl2{S4
zgZt&!yp{eFn^Ob0eYr(XPdRztyi$6i=&Dnd^p&W4-PiPKe*s2MP+BEPhCr_b@g5$X
z9YGvF+6+0|3C=*k4xfln^}%_s{ZyYCKqoBE8<StK_!%GL%;1PB5=m4Ink@9DBE$5q
z{$58IeTCBFS?}pG$bw9BK}3x2Ry*3-q%*;wWU*MZ!h&&f^Ow6DHU*BvQ$}E*1LInb
fUGZUbwPF4M?&=9n;B1Ra00000NkvXXu0mjfc^xNt

literal 0
HcmV?d00001

diff --git a/images/favicon.ico b/images/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..3cc1a204f6de8e5716588bd609ae7dc3c0734e43
GIT binary patch
literal 15406
zcmeI22UJv7x5v-Wqhc?h#u7{HpCyVNEGU9vi5M$NvCw-5hbkSU$_!;FQik57Du{s6
zL{K!?jRmow();}18BN}|US9k{G`{b>x4dip*12=f*=PUno|#+rK7=rcAQ?WK&~^$r
zAw<YBLdcXUA3o0-K}Z-qws7IE{WA%<Jcf{2bPP4279Wn&=8JBMN5d6E%AZAql|4Bd
zeECrfyYx|X(4_~_XA15{duLpawq)g<l~#5R->&5qI%~0`<nP6-*lDxWJ}IWnBQpUW
z8C<xh#lkiDES$I;Sh9R@KqUl+_O{`Waz50pnCpLUPd_vK*lq=3fvyM%a7M7d6M}r~
z;pbrkdn+C2uX_&D4Lvx%-5wgpl<oh(p2S6Zgr5y{=?@M%{fKqO?-tA7??!-+ciky7
z??$r~-OyP67zgxSA*bOH^m}_wUj2sbs}FfOSMMXOumT0w?jf)ICeklnMWDk&Sj_8$
zt$qbUlFN8%!MA7s-k$%NS@=*as<wS%YHHS^BS*|FW=>ZrpRusKddA?$>J3A#y_vDp
z^2pT5dtGE^FWde{_WWam201Sw#PAX!_B<lY1)1H6;iDt4Y_cr&EFbk;TSjcv$85>C
z=kv(0Yh6U*A&BH_2(P`0IjZ(pdfE+!0rR^pb;$CM+R~1}K|O|JGWsDF(T~-pnK&CK
zi<>qCH9Dl;Jy&|(zp~}`NqDzy8ROiE5#c?Mvk$~IyAe$-!Nl|#iwycRTYmpKd55*L
z{2xF*rwfavHeu62ce8=m{WgXpO~klvt6wL?dSbUyG2GmK@%ycBpzYffGNq%AFe@@v
zI3AYIlo+Vp|J8B)>&z62426Gv`nnA~0<)$vXZi%ht&56E*c29#xH%*wMj;?5dK(`<
zcI?*Ezv9b^SMUvpp+qTo`EnH8-9r={?R*uCOdYrE*{{7(X5$WNdBy!}O-&tEE*N)k
z@IVwy7->3jzXi9ObF&>`x7xwE)dtSZHiXsjX+_wL7KC1JMo7&&gj{<Ic2yIC%3s32
z<Qcs4AHXT)2K3Kdz!uE_EZOgaACv=ejLE{n<u3aN;(ydA6LFc{HdoZVIDIMsY^eCN
zpyC$<RlhS(@d<>==>Qz@_QMfRA1J$f<DiQN_B*;j$;JWO4K1)yO$#!6tgu!4F}CS;
zLss`Sb{;as^f_h&^8a2VlL+`1gx$)%Rfz1H1<0<uKso=Oth#)hyFt00hs@esq+iQM
zN@XUvC5Z^h41t|18=4!Mp|hqB`dj*NVpS(hP1taXvQHhz`SZstVJ#7Mk30P)`dl2&
zW}HQ2Y6QYlIN&6QA}lEwp$RMm$DaW!HUNQ9KJW>1hpWFGEFJWr|6>sh*Y?9yz6VCM
zRhPN}Wz`>X+<1T5K>hQ_Z9gnMCBRJY728rDY-@w}u&nj*8)vNa5ooE001I9Cn(4sZ
z@C2-nDnn=4V;HaD!AhYUCadUub(0575B~(yLrPr=hb&bGs)5VpPDoA7<|QViBR(z#
z+}K3KMaLudY#d@DV-Ou4jVMkOB0_0wf;nIZvf#=Lh5n*mSjl$7d{r+r7S%)ZxE_ta
zCA75k@sDFs`UCOLWHJR3xK3J;en<Kve2*Z+OS#L_>3FrPlTo(IDYF#UQ<e#?j+R{7
z#yUHi#W~uU#-Fs($u!#T`_6D)FKjn-z<fau6t`T!RwE8J7;v!YcyRr6)7psx@t;)w
z)V1bm3vN7X!S&y0Xhzj9uTk6BhT7+?sA*_H&9i1yK5C+E3vJ)RO|1oH^LoHs)(L}o
zFA$eji3=6?kzHC3{e<Ux2I8AP#y_RnF)XXPH#DajoZM=Jrj;Tx={%fVeUX?|g2e1n
z#HJM>f|~_*pI}5K<s&5e9FEU?0qezWFrG-CiFRIsT|hb<**UQE$>1sZ)hQ0tKYtvn
z3c_cuwNHrnx#x%Ex{lQ`F-eL!ITy@5J^e3eX&K}nIb<25tZd9OHDJZb$OOl3UTLB(
zBUlWLDbJxQ{_2U%*_Kr{1s$tQvb(;QJyAM#lBCj-`ID6BGtlR5yXjN3(mp~CG!n9{
zpE3Gg6B*aZ6A;a+|5ntNH%Ig`#**lIdgs$3WK+0^;P%G(->&IbTspga$#+YJeY$s*
zoz!POA<Dgk90eh|Ai|*_Q+q&`@Ia=$g8)5$0pW8PCb||sNKC_yxsuS89g=o*16lgX
ze#O!!4IXroUOP;vPZvrE2qOtZFc)OVFVt@@81r94e55xPORUF%g%Z$SKL~oeMWOSP
zXt>fGBJ#<;C*6`{2<JZ`jARg@LJ+ZQ)aN~rVb4G$njtc_3=)%EFkB`Xq9-3i%2F4O
z&H`{dB>-CkL9Vnkp>IH+v5kAqW=wnqGM@5%JII1=khwJ8Q#&ED>^#0XQ;0<&me}sI
zAG;$CL6&QcAI^os({E&>jRyJb-x@=$_Ze~S2QjAKuPNm?Wh)OO)bC;5l}gwK$s<FL
zf#PEfRGeU-$b!N1%U=<tl+XH%-`PLoHKwNb$)eZ&nD(q6;~w_on~FY&nm)$e^l@m8
zA!rRDXz?Rx@*;R@N_y($iRM48KYwrj{ho-q*<F}b)Pw0&J(zN@7vsvV;&HwRdd?H{
z#1eExlK!@6lG+eJ{_x%Db5t-EHaiMz>;OZx1H_~kM6DZ)EsdBh9S(;r>rkw~$SvGS
z{`CFjQ#FWhd+jV})&)`9E)4hT#J4`p5FIoLB7<gP_?C0$Cley}sdEp+&UT}?(cV*@
zSXb8vwfp^GH}t{DhYcyIH83-?t=P4F&yIo2^QmLPcaUkpri*!jH<FQh>k2NFR=_);
z1P%_Nc=V{DFCrpl(WlNmP`fNuMplsl1Kg8q;OlrC)4r|5q{)vUEBlg1NY%7~&hx3`
z>X=02O}c<pTLpR4^MLSPTpNLK4}->|J2D3REcL0re{a`ulT5}lG7I!-{QJJIpT9=n
zYXts!1o(N9g_I+duXz$0gs<DbAp%N*v&Nf`Vlp`rQBg^fAu%!IY0hbUcsO?gn;kWg
zkI#Q2>AxQHp=16N_Cyz#4=|ZN6D=)0Cm0x5PT0FwYrMR?l7!UyZR6Hc4xTU?YhmFu
zX8Lp;(a$((#^lMya(fICQ~h&qUp!m(usHnkz02XH_bzja@04?jZdZn0tgqq|!l$sl
zD){2<N_GLC`bzeNTa~Qyw<^!%*Hxa$rDWGu24r5V^i8R(Je_#C(k=Q@g?(^Ng@GHl
zY@eQYsmyNuqNSS;7c7@kxv)#!`n;K2c+MgzTa{1MFDZF!(EL@#akfcyys+!72))sY
z5SnA-6UuL|^P%~++GYgTzC$q0yRmC%-tF2Ou&*}J{M&1=C}%2OBCzZQ0!ka;Tl56p
z=O4f=>oy#cYGD>$ilgr7kk|BslzJo<swZQy&IN2Z>JQyh94uINiu0-Z2{|S_W!7G&
zLw51+Ok>OX&5|y|G_jOY0uw$7#V|{_1k?B;m~ts`7hw`x2;-On7)PInQB*z*BXVKD
z$%1}pCUn`U&<sezF|QaLaSFvxmcH1k=>*wBPFSazhmAJfSZ~`2X`42zQ;o(!T}RBE
zYxNoW37I-|@3@6al=p2j)X3Li+wcrSZDAC`gi$b`lQ3daST?_64MP?s$O;C5me8l@
z1(-wE&kQ;~rqJ{<!f`h}s5xljprs0S>FkC4PDA{tkdMQ=J8)u4FVuJT;^5Y9?A~@B
zNA*phq-~a`=&C6G>E8h%az1jRr&7J1Ub<hwD|vVor4Oo5@}LsM_bc90Lfex2<?r_|
z-7Cj??v~*arRdHTT)cf5h4m#kf3pZVHRq9bB?HlU(eRH5gVD}M(3R<d9?d5j%Jo8j
zJr6podvId&HMn|vA~@0itwFGX{AcNB$_=$Y>)M)nIUDJhb12zJEy+YmaR!nvr6Gw=
zQ3?`^l96yR8S#Zlh%cZy!3#7en9oIYZY-j*qQFV#fRzx0)8U?Q^tFY#=^5y+>V}~-
z4@T?wdC}hY`BA;KEl}PZ03BmBn4i*VI%aox!{_Md+aB}M(6KUAzspqrk;hcqfs?A+
zVWX-DYn5%3t*|<>6_$q;V8KUO9%hH+XkK(POb%><(f*Av*t-F`N;1&gxf-hSQrNeC
zA&$yKLw|WMjH!O}E!{Aa?S;wOUa0-!2DU6%iR~-D$IewVU+$bgNqy@Q$pO{FK6;)s
z@12~Gd?GtNqct}p9eEiU$j|(Zbh@90{EReuoKGrp(^HX?o`USOWMrpOQj(C7l!z2=
z9L>*$Bgi)Z#xjjC;Ok$-gY8f4FkRIL<K<n@Q%r`tqZ0x=J>lc(#?vx(Y~7$2EAvrp
z1L$C}Si*Vvm;7_{i+MS@Maar3MAo?rICqZb>ofC_nVw6@Mn+l|GE&bWJtY%qNf}5@
zOh-yWDw5+<ki<;}m*$znLnCqeR2+2v(F0?tK3L0j!FnT&{|fs5ol_6p6J|JRZ4Y~U
zXINX?VV9;$`=VnpKMX+TM~ASOTL%R=XmGvl^mv{pwc%!?30JEVaJ4wr?`(0r%h61u
z#okowjlGFxlbx}~YsxEIBaN3#!xJy9^^d-?(mVRvLPx#HL`(UdzWSaXErn2?_N;E0
zF6)Qmk8Lnt+y{dNt&m&dgU#EOp`fIOt$WlVw@VF6_n#`Cas2#*kBa+~j$>6-!e>ir
zkLISIe3_qa$;;2MLSDKBFC)e3b!6h{+=$fVpzw?=pQv*=UNJcZZt?k-oD=dcol3Y+
z?3h$g;*gYg$$^_+<j5x~Gtc!5`*ez#%++??@4I0m-HnqgXdPuXttb4@h~1_YIOJP}
z0|C{Lb*sWZY_HVI_}%!CC_(T~Wd32dQDn_vZu#@H>Zh%^L38ULXv1~7|DMJVZS}Kv
zsD07?0h))eq5b?^d~HK3s-L{2IeD6wr@8p*$IU3HY=Zgf4wz8Qw$kmergZ_cIqhH`
zszXZtbre)Rpgcro=^f}KzjXaWVFNs9oqT(CcEy7S<@F8VQ}Oe&_bq?Fte)=QZlJ#E
z5p*^M7584E>fvit+<*0czx?hC<W>BP;J8d&xzmWUJ1<ajs{yf*&!IPm))Qv-f+^Jw
zvuWMXpV0_c+Z@EDUq)))RV3$BAtt318k{@a0ZRSDVf^@?3@r$@KYOvwA-0%jA6EiL
zZZVu_YZn*~SKn})3FX4Z#t~=2xCn_%K@cYqp8g#8vttp+j-ojsXSjH>;Bh(>E?&XV
z-jxHDv9Dk-xdT@7+hIJZ1BcgCL&GTn+8)W!@<^rk=@cm2C-iUiykYW(!v5kxX=b62
ze>o(=ZgnpX|G7&svaVYty{=g$IXT^&n~?0Cm6ccK?B@N%&d#OD`lQqSlUDXObabrh
z&CPADWTl;}*}hHtj=GwCrna`lsUJimGPjG|rvKLlXpCxs`p{P|&D>rbVP5D|wJPX1
zY@XjKXPDkKYag?I^r%sa;&UXGCQY2U`<uTg`7aHB_wL=ug4_Zr<pat)XHK3;UtKi~
z2-$y;5Cz{+W8;iO#Gls?;aBGb1+%tE5UV@OM3S&?YBjbBKf)@(64nQ2(llqehvrOG
zl!XM;tcHm0^O!SvQ`G9Yo5N>Mk*6umFTE+z-1t8Fysnf)$nI8J=>)Cg@B~Kx+)2Kp
zIo?S<JTk2DoiI7wKV^swPf{=oBN>$tBIb>BB}o(^(hnG9<4YliY{#^5%lmgNo5oXH
zH>ylYYRqb?obhGUtI(R$!EKZ{s=b}YoN7KyQKx8vAwugwwB|<U(OSV`S|gZE^VEZ@
zAtY1+hUh~G2zWqDa4klP&c)Q>v$65}DbSQ2`9@z(DEi1+vgXTZrq85uKB9s|LxJkd
zrgRdbLG>C_m=tG<C#@w>?SpAeXJXd}{VRB2%xeNeECGYX0x*4~7FJ3wfc(^nP+K|*
z`WpsANA{b$T3bX_ln7k__r<mn@sT6hQ9^=N&4k!ct?m>SC5CFw01>!A>u*<RP3mWm
z5ltXddT1T26O2jMF?5I(7E7+jt{GFIyks~|Z2AV8J4B#+PzYv6g`ey17u-l?3w$x@
z-`yW3G_)v<Fk-3R2&y-Q*6a9s%c$P#AcJm$h|}8LNLtsDd<UU%_c3^UI>bl&WAp-l
z3_WrcVpet7rM4I*+Ja!(2*BN55Y85YP3D>cwD$Q0wG#+3#@=xwA~ie)zuw2!JM=!t
z=muKzdkr$-9o5`SX$51#3kXln#&88MOxHHRX!R(JaH+ts&}$eJ`!nW72H<egPw?W5
zfUg?^X2%3*9rH_Sr?ZcLyNF0MJtQ;Qz?jiW>xQ&;w2;;em-T@xr!1k&q1Txytq_@;
z4$<K25IKJjD<ZAwyW>nS{l~z8Jq9O3CPFWK1y<w)VSSnl&&y+Ift5Cy`o(@TzfLT$
zrtb*Z)7wh)`)R$AqQ<W~{@U)N`||Yvxw;oac3s8j(#M!yRt-~sX>e>Ah|y;tgVsv&
zjTp$WVc=XK;l-s7Z&0k@nC#|ye$ji=Z*dWE>CqKV>ggI<)(7#zeu!lDLnyr;j97XO
z?vxWe%+siXQ<?!nvVKBt<O0;$3gMms!DAhQCmOU>q3k66*Y`6DujGiz78XkXkJURy
zUF|zG{b_H@+LnH-p)7pckMCdgW5V-(47%F~A^$h9@HmL(d_lBu8R(#O*lr4~gQ7i*
z)??YU9_vH;ZmE*EGHEiP^(Mc??(b-4HS{c5@uKG!nMb`?`LGYmp7vo8tsl>C?8A4j
z`Y`OyOE|=CL~D@%x-U`fsRW(jRC_Stb)6+0?GfbW3l`BRWPaJ&=Eu%;av{Wq8NVwN
zQ*DRHzBY(yw?oXX6N9|FA<pW?U``uWdW0asbuJ!w3Zl`Ofk)PizGqIPs6CeGy@?=`
zJ~sFN$niNZhA^Z$t_!Sg2SbJ4Q|;)r#-3htj469NLDoKpz=9fxFDk);l_3aNJ+UEw
z136H*oeZZpi!b#2GVn8(-1=;g$cFY>K}-4!<Jbcs-42Lqwc;E777W*Y0WtA~U<gP;
zc<5$`ZmNDWT(Rf?o&C$*O9w9f{Ya*glE?-{X4O2E7<}vBhUr<oSVY%NY`oqF)f?T|
zvPTt?5)(0YtORDrXhbhxDW~(@ci;Uj>oxwO)Q=ALloN`#P+bAp+74{HP1i8ewI8SH
ze>kV94`p|sLRDQ0t5>fF^Q7w=FZaOf#}DgG`RHW-hK`V(L@`orG|#*^8|DvwLBQh%
zWZis#t95r#O5eFpG6NCJjzMkhZC+j7-HRMf{Qs);*sd#yc$lKV%XDK&6tJaGoS+T|
zy2dq_l>w<$)tEcK7+TutD7g59=jl}+I%iJ&kbgs%|E0@PJ+h<LhP2i)3Cfvck>t4<
zxoidOmt$k(h_@IyvK8Yc+Ib^}z6&9wW<YC`|E0sfxf3)O+88gmv-upOs;!W`>9|4q
zy6zC(n`@$oKCQviIGEe4tZ=?$`J5Ni|7O>J>vUARL`x;v-Fu(d_dX@jea}f^*Du7s
z`4&-ro=e95t<JvSWOU7m0PivRhF3!dqlyTBL04a={u+U=5%?N`uMzkff&VuV_z%WJ
Bdkz2q

literal 0
HcmV?d00001

diff --git a/images/pipeline.svg b/images/pipeline.svg
new file mode 100644
index 0000000..923ecfe
--- /dev/null
+++ b/images/pipeline.svg
@@ -0,0 +1 @@
+<svg width="4042" height="1146" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" overflow="hidden"><g transform="translate(-238 -916)"><g><path d="M2497.5 957.169C2497.5 938.574 2512.57 923.5 2531.17 923.5L3100.83 923.5C3119.43 923.5 3134.5 938.574 3134.5 957.169L3134.5 1327.83C3134.5 1346.43 3119.43 1361.5 3100.83 1361.5L2531.17 1361.5C2512.57 1361.5 2497.5 1346.43 2497.5 1327.83Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2677.8 1011)">Data Split</text><path d="M3577.5 957.169C3577.5 938.574 3592.57 923.5 3611.17 923.5L4089.83 923.5C4108.43 923.5 4123.5 938.574 4123.5 957.169L4123.5 1327.83C4123.5 1346.43 4108.43 1361.5 4089.83 1361.5L3611.17 1361.5C3592.57 1361.5 3577.5 1346.43 3577.5 1327.83Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3636.3 1011)">Tokenized Data</text><path d="M3634.5 1081.67C3634.5 1072.19 3642.19 1064.5 3651.67 1064.5L4062.33 1064.5C4071.81 1064.5 4079.5 1072.19 4079.5 1081.67L4079.5 1150.33C4079.5 1159.81 4071.81 1167.5 4062.33 1167.5L3651.67 1167.5C3642.19 1167.5 
3634.5 1159.81 3634.5 1150.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3698.68 1135)">Megatron</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3912.95 1135)">-</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3931.29 1135)">LM</text><path d="M3636.5 1213.67C3636.5 1204.19 3644.19 1196.5 3653.67 1196.5L4062.33 1196.5C4071.81 1196.5 4079.5 1204.19 4079.5 1213.67L4079.5 1282.33C4079.5 1291.81 4071.81 1299.5 4062.33 1299.5L3653.67 1299.5C3644.19 1299.5 3636.5 1291.81 3636.5 1282.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" 
unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3710.1 1267)">Transformers</text><path d="M2532.5 1084.33C2532.5 1073.38 2541.38 1064.5 2552.33 1064.5L3074.67 1064.5C3085.62 1064.5 3094.5 1073.38 3094.5 1084.33L3094.5 1163.67C3094.5 1174.62 3085.62 1183.5 3074.67 1183.5L2552.33 1183.5C2541.38 1183.5 2532.5 1174.62 2532.5 1163.67Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2635.13 1144)">Training dataset</text><path d="M1408.5 970.834C1408.5 946.349 1428.35 926.5 1452.83 926.5L1988.17 926.5C2012.65 926.5 2032.5 946.349 2032.5 970.834L2032.5 1148.17C2032.5 1172.65 2012.65 1192.5 1988.17 1192.5L1452.83 1192.5C1428.35 1192.5 1408.5 1172.65 1408.5 1148.17Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1586.74 1017)">Text Data</text><path d="M2532.5 1228.83C2532.5 1217.6 2541.6 1208.5 2552.83 1208.5L3068.17 1208.5C3079.4 1208.5 3088.5 1217.6 3088.5 1228.83L3088.5 1310.17C3088.5 1321.4 3079.4 1330.5 3068.17 1330.5L2552.83 1330.5C2541.6 1330.5 2532.5 1321.4 2532.5 1310.17Z" stroke="#172C51" stroke-width="4.58333" 
stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2612.54 1289)">Validation dataset</text><path d="M1439.5 1072.33C1439.5 1061.38 1448.38 1052.5 1459.33 1052.5L1696.67 1052.5C1707.62 1052.5 1716.5 1061.38 1716.5 1072.33L1716.5 1151.67C1716.5 1162.62 1707.62 1171.5 1696.67 1171.5L1459.33 1171.5C1448.38 1171.5 1439.5 1162.62 1439.5 1151.67Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1495.03 1131)">JSONL</text><path d="M1728.5 1074.33C1728.5 1063.38 1737.38 1054.5 1748.33 1054.5L1985.67 1054.5C1996.62 1054.5 2005.5 1063.38 2005.5 1074.33L2005.5 1153.67C2005.5 1164.62 1996.62 1173.5 1985.67 1173.5L1748.33 1173.5C1737.38 1173.5 1728.5 1164.62 1728.5 1153.67Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="55" text-anchor="start" direction="ltr" 
writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1781.86 1133)">Parquet</text><path d="M915.5 984.5 1311.5 984.5 1379.5 1052.5 1311.5 1120.5 915.5 1120.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="73" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 947.833 1079)">Extract text</text><path d="M459.5 963.773C459.5 943.188 476.188 926.5 496.773 926.5L849.227 926.5C869.812 926.5 886.5 943.188 886.5 963.773L886.5 1661.23C886.5 1681.81 869.812 1698.5 849.227 1698.5L496.773 1698.5C476.188 1698.5 459.5 1681.81 459.5 1661.23Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#DEEBF7" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="700" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 535.878 1015)">Raw Data</text><path d="M497.5 1332.67C497.5 1324.29 504.29 1317.5 512.667 1317.5L833.333 1317.5C841.709 1317.5 848.5 1324.29 848.5 1332.67L848.5 1393.33C848.5 1401.71 841.709 1408.5 833.333 1408.5L512.667 1408.5C504.29 1408.5 497.5 1401.71 497.5 1393.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" 
font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="50" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 548.647 1381)">HF Datasets</text><path d="M493.5 1217.67C493.5 1209.29 500.291 1202.5 508.667 1202.5L829.333 1202.5C837.71 1202.5 844.5 1209.29 844.5 1217.67L844.5 1278.33C844.5 1286.71 837.71 1293.5 829.333 1293.5L508.667 1293.5C500.291 1293.5 493.5 1286.71 493.5 1278.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="50" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 620.992 1267)">Web</text><path d="M497.5 1450.67C497.5 1442.29 504.29 1435.5 512.667 1435.5L833.333 1435.5C841.709 1435.5 848.5 1442.29 848.5 1450.67L848.5 1511.33C848.5 1519.71 841.709 1526.5 833.333 1526.5L512.667 1526.5C504.29 1526.5 497.5 1519.71 497.5 1511.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="50" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 623.126 1499)">ELG</text><path d="M492.5 1099.67C492.5 1091.29 499.291 1084.5 507.667 1084.5L828.333 1084.5C836.71 1084.5 843.5 
1091.29 843.5 1099.67L843.5 1160.33C843.5 1168.71 836.71 1175.5 828.333 1175.5L507.667 1175.5C499.291 1175.5 492.5 1168.71 492.5 1160.33Z" stroke="#172C51" stroke-width="4.58333" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="50" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 561.139 1149)">Local files</text><path d="M497.5 1574.67C497.5 1566.29 504.29 1559.5 512.667 1559.5L833.333 1559.5C841.709 1559.5 848.5 1566.29 848.5 1574.67L848.5 1635.33C848.5 1643.71 841.709 1650.5 833.333 1650.5L512.667 1650.5C504.29 1650.5 497.5 1643.71 497.5 1635.33Z" stroke="#000000" stroke-width="4.58333" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="8" stroke-dasharray="13.75 4.58333" stroke-opacity="1" fill="#F2F2F2" fill-rule="evenodd" fill-opacity="1"/><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="50" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 647.189 1623)">…</text><path d="M1464.5 1225.5 1899.5 1225.5 1967.5 1293.5 1899.5 1361.5 1464.5 1361.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="73" text-anchor="start" 
direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1497.24 1320)">Shuffle</text><path d="M1464.5 1404.5 1899.5 1404.5 1967.5 1472.5 1899.5 1540.5 1464.5 1540.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="73" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1497.24 1499)">Split by size</text><path d="M2062.5 979.5 2401 979.5 2469.5 1048 2401 1116.5 2062.5 1116.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="73" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2094.58 1074)">Compose</text><path d="M3162.5 978.5 3481.5 978.5 3549.5 1046.5 3481.5 1114.5 3162.5 1114.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="73" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3195.06 
1073)">Tokenize</text><path d="M1464.5 1580.5 1899.5 1580.5 1967.5 1648.5 1899.5 1716.5 1464.5 1716.5Z" stroke="#172C51" stroke-width="6.875" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#2F5597" fill-rule="evenodd" fill-opacity="1"/><text fill="#FFFFFF" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="normal" font-variant="normal" font-weight="400" font-stretch="normal" font-size="73" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1497.24 1675)">Count tokens</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 316.229 1845)">The raw data can originate </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 292.579 1922)">from different sources and be </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 427.375 1999)">in diverse formats.</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" 
text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1432.22 1845)">The intermediate data </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1413.7 1922)">representation contains </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 1383.25 1999)">primarily the “plaintext”.</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2506.2 1850)">Based on the config, the </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 2423.42 1927)">training and validation data is </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" 
text-decoration="none" transform="matrix(1 0 0 1 2532.55 2004)">selected and sampled.</text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3514.49 1850)">The tokenized data can be </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3530.57 1927)">directly used by common </text><text fill="#000000" fill-opacity="1" font-family="Times New Roman,Times New Roman_MSFontService,sans-serif" font-style="italic" font-variant="normal" font-weight="400" font-stretch="normal" font-size="64" text-anchor="start" direction="ltr" writing-mode="lr-tb" unicode-bidi="normal" text-decoration="none" transform="matrix(1 0 0 1 3491.84 2004)">model training frameworks.</text></g></g></svg>
\ No newline at end of file
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..f7c5add
--- /dev/null
+++ b/index.html
@@ -0,0 +1,629 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/">
+      
+      
+      
+        <link rel="next" href="getting-started/">
+      
+      
+      <link rel="icon" href="assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL(".",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#llm-datasets" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href="." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Home
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href="." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+      <a href="." class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="llm-datasets">llm-datasets</h1>
+<p><strong>llm-datasets</strong> is a collection of datasets for language model training including scripts for downloading, preprocessing, and sampling.</p>
+<ul>
+<li><a href="getting-started/">Getting started</a></li>
+<li><a href="config-files/">Config files</a></li>
+<li><a href="add-your-own-data/">Add your own data</a></li>
+</ul>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": ".", "features": [], "search": "assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/objects.inv b/objects.inv
new file mode 100644
index 0000000..82c9dc5
--- /dev/null
+++ b/objects.inv
@@ -0,0 +1,6 @@
+# Sphinx inventory version 2
+# Project: llm-datasets: Documentation
+# Version: 0.0.0
+# The remainder of this file is compressed using zlib.
+xڽ��N�0��<E%�҉+G@�	<���n�)���ޞ���@�F�K9���g9��@-X2��E���.C���[�QqS�f��xmvyU\_�8e�4c���h�2u�&Z���Ll�ϼ�-�8��	΀��#�Ua����D	�+�Rn1$V&٪�Z3N���Fp+J��.@�x�l���BN�" �(S�rbI| ����S��-���������S!�f�O����_�������2?��q�4d_�cc~�b4{�K���لhj>�/�D�
+|p�<z����S#+M�t�Qmy�O�+k�<�n����@��U��/?a��
\ No newline at end of file
diff --git a/overview/index.html b/overview/index.html
new file mode 100644
index 0000000..1a47dcb
--- /dev/null
+++ b/overview/index.html
@@ -0,0 +1,703 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/overview/">
+      
+      
+        <link rel="prev" href="../getting-started/">
+      
+      
+        <link rel="next" href="../datasets/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Framework overview - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#framework-overview" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Framework overview
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+        <label class="md-nav__link md-nav__link--active" for="__toc">
+          
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+          <span class="md-nav__icon md-icon"></span>
+        </label>
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+      
+        
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#data-schema" class="md-nav__link">
+    <span class="md-ellipsis">
+      Data schema
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pipeline" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pipeline
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../related-work/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+    <label class="md-nav__title" for="__toc">
+      <span class="md-nav__icon md-icon"></span>
+      Table of contents
+    </label>
+    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
+      
+        <li class="md-nav__item">
+  <a href="#data-schema" class="md-nav__link">
+    <span class="md-ellipsis">
+      Data schema
+    </span>
+  </a>
+  
+</li>
+      
+        <li class="md-nav__item">
+  <a href="#pipeline" class="md-nav__link">
+    <span class="md-ellipsis">
+      Pipeline
+    </span>
+  </a>
+  
+</li>
+      
+    </ul>
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="framework-overview">Framework Overview</h1>
+<h2 id="data-schema">Data schema</h2>
+<p><img alt="Schema" src="../images/data-schema.svg" /></p>
+<h2 id="pipeline">Pipeline</h2>
+<p><img alt="Pipeline" src="../images/pipeline.svg" /></p>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/related-work/index.html b/related-work/index.html
new file mode 100644
index 0000000..4f8f63a
--- /dev/null
+++ b/related-work/index.html
@@ -0,0 +1,631 @@
+
+<!doctype html>
+<html lang="en" class="no-js">
+  <head>
+    
+      <meta charset="utf-8">
+      <meta name="viewport" content="width=device-width,initial-scale=1">
+      
+        <meta name="description" content="Documentation of the llm-datasets framework.">
+      
+      
+        <meta name="author" content="Malte Ostendorff and llm-datasets contributors">
+      
+      
+        <link rel="canonical" href="https://github.com/malteos/llm-datasets/related-work/">
+      
+      
+        <link rel="prev" href="../compose-train-validation-data/">
+      
+      
+        <link rel="next" href="../api/base_dataset/">
+      
+      
+      <link rel="icon" href="../assets/images/favicon.png">
+      <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.29">
+    
+    
+      
+        <title>Related work - llm-datasets: Documentation</title>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/stylesheets/main.76a95c52.min.css">
+      
+      
+
+
+    
+    
+      
+    
+    
+      
+        
+        
+        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
+        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
+      
+    
+    
+      <link rel="stylesheet" href="../assets/_mkdocstrings.css">
+    
+    <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
+    
+      
+
+    
+    
+    
+  </head>
+  
+  
+    <body dir="ltr">
+  
+    
+    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
+    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
+    <label class="md-overlay" for="__drawer"></label>
+    <div data-md-component="skip">
+      
+        
+        <a href="#related-work" class="md-skip">
+          Skip to content
+        </a>
+      
+    </div>
+    <div data-md-component="announce">
+      
+    </div>
+    
+    
+      
+
+  
+
+<header class="md-header md-header--shadow" data-md-component="header">
+  <nav class="md-header__inner md-grid" aria-label="Header">
+    <a href=".." title="llm-datasets: Documentation" class="md-header__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    <label class="md-header__button md-icon" for="__drawer">
+      
+      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
+    </label>
+    <div class="md-header__title" data-md-component="header-title">
+      <div class="md-header__ellipsis">
+        <div class="md-header__topic">
+          <span class="md-ellipsis">
+            llm-datasets: Documentation
+          </span>
+        </div>
+        <div class="md-header__topic" data-md-component="header-topic">
+          <span class="md-ellipsis">
+            
+              Related work
+            
+          </span>
+        </div>
+      </div>
+    </div>
+    
+    
+      <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
+    
+    
+    
+      <label class="md-header__button md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+      </label>
+      <div class="md-search" data-md-component="search" role="dialog">
+  <label class="md-search__overlay" for="__search"></label>
+  <div class="md-search__inner" role="search">
+    <form class="md-search__form" name="search">
+      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
+      <label class="md-search__icon md-icon" for="__search">
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
+        
+        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
+      </label>
+      <nav class="md-search__options" aria-label="Search">
+        
+        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
+          
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
+        </button>
+      </nav>
+      
+    </form>
+    <div class="md-search__output">
+      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
+        <div class="md-search-result" data-md-component="search-result">
+          <div class="md-search-result__meta">
+            Initializing search
+          </div>
+          <ol class="md-search-result__list" role="presentation"></ol>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+    
+    
+      <div class="md-header__source">
+        <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+      </div>
+    
+  </nav>
+  
+</header>
+    
+    <div class="md-container" data-md-component="container">
+      
+      
+        
+          
+        
+      
+      <main class="md-main" data-md-component="main">
+        <div class="md-main__inner md-grid">
+          
+            
+              
+              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+
+
+<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
+  <label class="md-nav__title" for="__drawer">
+    <a href=".." title="llm-datasets: Documentation" class="md-nav__button md-logo" aria-label="llm-datasets: Documentation" data-md-component="logo">
+      
+  
+  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
+
+    </a>
+    llm-datasets: Documentation
+  </label>
+  
+    <div class="md-nav__source">
+      <a href="https://github.com/malteos/llm-datasets/" title="Go to repository" class="md-source" data-md-component="source">
+  <div class="md-source__icon md-icon">
+    
+    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.5.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg>
+  </div>
+  <div class="md-source__repository">
+    GitHub
+  </div>
+</a>
+    </div>
+  
+  <ul class="md-nav__list" data-md-scrollfix>
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href=".." class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Home
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../getting-started/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Getting started
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../overview/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Framework overview
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../datasets/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Available datasets
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../config-files/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config files
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../extract-text-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Extract text data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../add-your-own-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Adding your own data
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../compose-train-validation-data/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Compose training and validation dataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+    
+      
+      
+  
+  
+    
+  
+  
+  
+    <li class="md-nav__item md-nav__item--active">
+      
+      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
+      
+      
+        
+      
+      
+      <a href="./" class="md-nav__link md-nav__link--active">
+        
+  
+  <span class="md-ellipsis">
+    Related work
+  </span>
+  
+
+      </a>
+      
+    </li>
+  
+
+    
+      
+      
+  
+  
+  
+  
+    
+    
+    
+    
+    <li class="md-nav__item md-nav__item--nested">
+      
+        
+        
+        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
+        
+          
+          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
+            
+  
+  <span class="md-ellipsis">
+    API reference
+  </span>
+  
+
+            <span class="md-nav__icon md-icon"></span>
+          </label>
+        
+        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
+          <label class="md-nav__title" for="__nav_10">
+            <span class="md-nav__icon md-icon"></span>
+            API reference
+          </label>
+          <ul class="md-nav__list" data-md-scrollfix>
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/base_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    BaseDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/hf_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    HFDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/jsonl_dataset/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    JSONLDataset
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+              
+                
+  
+  
+  
+  
+    <li class="md-nav__item">
+      <a href="../api/config/" class="md-nav__link">
+        
+  
+  <span class="md-ellipsis">
+    Config
+  </span>
+  
+
+      </a>
+    </li>
+  
+
+              
+            
+          </ul>
+        </nav>
+      
+    </li>
+  
+
+    
+  </ul>
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+            
+              
+              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
+                <div class="md-sidebar__scrollwrap">
+                  <div class="md-sidebar__inner">
+                    
+
+<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
+  
+  
+  
+    
+  
+  
+</nav>
+                  </div>
+                </div>
+              </div>
+            
+          
+          
+            <div class="md-content" data-md-component="content">
+              <article class="md-content__inner md-typeset">
+                
+                  
+
+  
+  
+
+
+<h1 id="related-work">Related Work</h1>
+<p>This framework is heavily inspired by other open source projects:</p>
+<ul>
+<li><a href="https://github.com/huggingface/datasets">Hugging Face Datasets</a></li>
+<li><a href="https://www.tensorflow.org/datasets">TensorFlow Datasets</a></li>
+<li><a href="https://github.com/NVIDIA/NeMo-Curator">NVIDIA's NeMo-Curator</a></li>
+</ul>
+
+
+
+
+
+
+
+
+
+
+
+
+                
+              </article>
+            </div>
+          
+          
+<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
+        </div>
+        
+      </main>
+      
+        <footer class="md-footer">
+  
+  <div class="md-footer-meta md-typeset">
+    <div class="md-footer-meta__inner md-grid">
+      <div class="md-copyright">
+  
+  
+    Made with
+    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
+      Material for MkDocs
+    </a>
+  
+</div>
+      
+    </div>
+  </div>
+</footer>
+      
+    </div>
+    <div class="md-dialog" data-md-component="dialog">
+      <div class="md-dialog__inner md-typeset"></div>
+    </div>
+    
+    
+    <script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
+    
+    
+      <script src="../assets/javascripts/bundle.fe8b6f2b.min.js"></script>
+      
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/search/search_index.json b/search/search_index.json
new file mode 100644
index 0000000..c814fec
--- /dev/null
+++ b/search/search_index.json
@@ -0,0 +1 @@
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"llm-datasets","text":"<p>llm-datasets is a collection of datasets for language model training including scripts for downloading, preprocesssing, and sampling.</p> <ul> <li>Getting started</li> <li>Config files</li> <li>Add your own data</li> </ul>"},{"location":"add-your-own-data/","title":"Integrate a custom dataset","text":""},{"location":"add-your-own-data/#write-a-dataset-class","title":"Write a dataset class","text":"<p>The first step for adding a new dataset is write a new dataset class. If your data comes from a common source such as Huggingface, you can build upon existing abstractions.</p>"},{"location":"add-your-own-data/#huggingface-dataset","title":"Huggingface dataset","text":"<p>For example, Huggingface datasets only needed to specify some metadata like dataset ID, title etc. and the column where the textual data can be extracted from (by default <code>text</code> column):</p> <pre><code># my_datasets/pg19.py\n\nfrom llm_datasets.datasets.hf_dataset import HFDataset\nfrom llm_datasets.datasets.base import License, Availability\n\nclass PG19Dataset(HFDataset):\n    DATASET_ID = \"pg19\"\n    TITLE = \"Project Gutenberg books published before 1919\"\n    HOMEPAGE = \"https://huggingface.co/datasets/pg19\"\n    LICENSE = License(\"Apache License Version 2.0 (or public domain?)\", url=\"https://www.apache.org/licenses/LICENSE-2.0.html\")\n    CITATION = r\"\"\"@article{raecompressive2019,\n        author = {Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and\n                    Hillier, Chloe and Lillicrap, Timothy P},\n        title = {Compressive Transformers for Long-Range Sequence Modelling},\n        journal = {arXiv preprint},\n        url = {https://arxiv.org/abs/1911.05507},\n        year = {2019},\n        }\n        \"\"\"  # noqa\n    AVAILIBILITY = Availability.DIRECT_DOWNLOAD\n\n    HF_DATASET_ID = \"pg19\"\n    
HF_DATASET_SPLIT = \"train\"\n    streaming = True\n    text_column_name = \"text\"\n    title_column_name = \"short_book_title\"\n</code></pre>"},{"location":"add-your-own-data/#csv-dataset","title":"CSV dataset","text":"<p>Other datasets may require implementing the full text extraction logic. The example below reads text data from CSV files while excluding specific subsets:</p> <pre><code># my_datasets/csv_example.py\n\nimport logging\nimport pandas as pd\nfrom pathlib import Path\nfrom llm_datasets.datasets.base import BaseDataset, Availability, License\n\nlogger = logging.getLogger(__name__)\n\n\nclass CSVExampleDataset(BaseDataset):\n    DATASET_ID = \"csv_example\"\n    TITLE = \"An example for a dataset from CSV files\"\n    AVAILIBITY = Availability.ON_REQUEST\n    LANGUAGES = [\"en\"]\n    LICENSE = License(\"mixed\")\n\n    def get_texts(self):\n        \"\"\"\n        Extract texts from CSV files (format: \"documen_id,text,score,url\")\n        \"\"\"\n        # Iterate over CSV files in raw dataset directory\n        for file_path in self.get_dataset_file_paths(needed_suffix=\".csv\"):\n            file_name = Path(file_path).name\n\n            if (\n                file_name.startswith(\"mc4_\")\n                or file_name.startswith(\"colossal-oscar-\")\n                or file_name.startswith(\"wikimedia\")\n            ):\n                # skip subsets that overlap with other datasets (baes on file name)\n                continue\n\n            logger.info(\"Reading CSV: %s\", file_path)\n            try:\n                # Use chunks to reduce memory consumption\n                for df in pd.read_csv(file_path, sep=\",\", chunksize=10_000):\n                    for text in df.text.values:\n                        # Pass extracted text\n                        yield text\n            except ValueError as e:\n                logger.error(\"Error in file %s; error = %s\", file_path, 
e)\n</code></pre>"},{"location":"add-your-own-data/#register-new-dataset-classes","title":"Register new dataset classes","text":"<p>Each dataset class needs to be registered with <code>llm-datasets</code> such that the commands know what classes are available. This can be done by making a new Python module with a <code>get_registered_dataset_classes</code> method that returns a list of dataset classes:</p> <pre><code># my_datasets/dataset_registry.py\nfrom my_datasets.pg19 import PG19Dataset\n\ndef get_registered_dataset_classes():\n    return [\n        PG19Dataset,\n    ]\n</code></pre>"},{"location":"add-your-own-data/#load-registry-in-commands","title":"Load registry in commands","text":"<p>To load the registerd datasets in the pipeline commands, you need to specify the <code>--extra_dataset_registries</code> argument:</p> <pre><code>llm-datasets compose ... -extra_dataset_registries=my_datasets.dataset_registry\n</code></pre>"},{"location":"compose-train-validation-data/","title":"Compose dataset","text":"<p>The pipeline step that produces the final training or validation set is the <code>compose</code> step. Before you run this command, you should specify in the config files what datasets should be selected and how they should be sampled.</p> <pre><code>llm-datasets compose \u2013-split=train \u2013-configs=my_dataset.yaml \\\n    --text_data_dir=/data/my_text_data \\\n    --composed_data_dir=/data/my_composed_data/train/\n</code></pre> <p>Depending on the your system (especially IO-speed) and dataset size this step can take a substantial amount of time (&gt; 24 hours for a 1T token dataset).</p>"},{"location":"config-files/","title":"Config Files","text":"<p><code>llm-datasets</code> allows you to specific general settings through config files so you do not need to specific always the same command line arguments. Several commands support passing the <code>--configs</code> argument which should point to one or more YAML-files on your file system. 
For example, the text extraction command:</p> <pre><code>llm-datasets extract_text ... --configs $PATH_TO_YAML_CONFIG_FILE\n</code></pre>"},{"location":"config-files/#specifing-local-paths","title":"Specifing local paths","text":"<p>In the config files, you can store for example system specific settings like the local paths, where the raw dataset files are located:</p> <pre><code># ./examples/llm_datasets_configs/my_system.yaml\nlocal_dirs_by_source_id:\n  redpajama: /my_system_specific_data_directory/redpajama\n</code></pre> <p>The RedPajama dataset requires the manual download prior to the text extraction. With the above config, we tell the extraction command the path where we downloaded the RedPajama data by providing the config file:</p> <pre><code>llm-datasets extract_text redpajama_book --configs ./examples/llm_datasets_configs/my_system.yaml\n</code></pre>"},{"location":"config-files/#dataset-selection-and-sampling","title":"Dataset selection and sampling","text":"<p>The configuration files are also needed for specifying the final dataset composition, including the selection of the datasets and their sampling. The following examples shows a config for an Italian dataset:</p> <pre><code># ./examples/llm_datasets_configs/italian_data.yaml\n\n# a fixed random seed for shuffling etc.\nseed: 0\n\nselected_dataset_ids:\n  # italian subsets\n  - itwac\n  - eurlex_it\n  - wikipedia_20231101_it\n  - wikibooks_it\n  - wikinews_it\n  - colossal_oscar_2023-23_it\n  - parlamint_it\n\n# down-sample webcrawled + up-sampled high quality\nsampling_factor_by_source_id:\n  colossal_oscar: 0.1\n\nsampling_factor_by_dataset_id:\n  itwac: 0.5\n  eurlex_it: 2\n  wikipedia_20231101_it: 3\n</code></pre> <p>To use this config, provide the path in the <code>--configs</code> argument:</p> <pre><code># compose final dataset\nllm-datasets compose ... 
--configs ./examples/llm_datasets_configs/italian_data.yaml\n</code></pre>"},{"location":"extract-text-data/","title":"Extract text data","text":""},{"location":"extract-text-data/#download-and-text-extraction","title":"Download and text extraction","text":"<p>To download and extract the plain-text of one or more datasets, run the following command:</p> <pre><code>llm-datasets extract_text $DATASET_ID $OUTPUT_DIR\n</code></pre> <p>By default, output is saved as JSONL files. To change the output format, you can use the <code>--output_format</code> argument as below:</p> <pre><code>llm-datasets extract_text $DATASET_ID $OUTPUT_DIR --output_format parquet  --output_compression zstd\n</code></pre>"},{"location":"getting-started/","title":"Getting Started","text":""},{"location":"getting-started/#installation","title":"Installation","text":"<p>Install the <code>llm-datasets</code> package with pip:</p> <pre><code>pip install llm-datasets\n</code></pre> <p>In order to keep the package minimal by default, <code>llm-datasets</code> comes with optional dependencies useful for some use cases. For example, if you want to have the text extraction for all available datasets, run:</p> <pre><code>pip install llm-datasets[datasets]\n</code></pre>"},{"location":"getting-started/#quick-start","title":"Quick start","text":""},{"location":"getting-started/#download-and-text-extraction","title":"Download and text extraction","text":"<p>To download and extract the plain-text of one or more datasets, run the following command:</p> <pre><code>llm-datasets extract_text $DATASET_ID $OUTPUT_DIR\n</code></pre> <p>By default, output is saved as JSONL files. 
To change the output format, you can use the <code>--output_format</code> argument as below:</p> <pre><code>llm-datasets extract_text $DATASET_ID $OUTPUT_DIR --output_format parquet  --output_compression zstd\n</code></pre>"},{"location":"getting-started/#available-datasets","title":"Available datasets","text":"<p>A list or table with all available datasets can be print with the follow command:</p> <pre><code>llm-datasets print_stats --print_output md\n</code></pre>"},{"location":"getting-started/#pipeline-commands","title":"Pipeline commands","text":"<pre><code>usage: llm-datasets &lt;command&gt; [&lt;args&gt;]\n\npositional arguments:\n  {chunkify,collect_metrics,compose,convert_parquet_to_jsonl,extract_text,hf_upload,print_stats,shuffle,train_tokenizer}\n                        llm-datasets command helpers\n    chunkify            Split the individual datasets into equally-sized file chunks (based on bytes or rows)\n    collect_metrics     Collect metrics (token count etc.) from extracted texts\n    compose             Compose the final train/validation set based on the individual datasets\n    convert_parquet_to_jsonl\n                        Convert Parquet files to JSONL\n    extract_text        Extract text from raw datasets\n    hf_upload           Upload files or directories to Huggingface Hub.\n    print_stats         Print dataset statistics as CSV, Markdown, ...\n    shuffle             Shuffle the individual datasets on the file-chunk level (no global shuffle!)\n    train_tokenizer     Train a tokenizer (only: sentencepiece supproted)\n\noptions:\n  -h, --help            show this help message and exit\n</code></pre>"},{"location":"overview/","title":"Framework Overview","text":""},{"location":"overview/#data-schema","title":"Data schema","text":""},{"location":"overview/#pipeline","title":"Pipeline","text":""},{"location":"related-work/","title":"Related Work","text":"<p>This framework is heavily inspired by other open source projects:</p> <ul> 
<li>Huggingface Datasets</li> <li>TensorFlow Datasets</li> <li>NVIDIA's NeMo-Curator</li> </ul>"},{"location":"api/base_dataset/","title":"BaseDataset","text":"<p>               Bases: <code>object</code></p> <p>Base class for all datasets. It implements all generic loading, processing, and writing methods.</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>class BaseDataset(object):\n    \"\"\"Base class for all datasets. It implements all generic loading, processing, and writing methods.\"\"\"\n\n    DATASET_ID = None\n    SOURCE_ID = None\n\n    TITLE = None\n    DESCRIPTION: str = \"\"\n    HOMEPAGE: Optional[str] = None\n    AVAILIBILITY: Availability = None\n    DOWNLOAD_URLS: List[Union[str, Tuple[str]]] = []\n    LOCAL_DIRS = []\n    VERSION = None\n    DOI = None\n    CITATION = None\n\n    LICENSE: Optional[Union[str, License]] = None\n    PII = None\n\n    LANGUAGES = []\n\n    SUPERVISED = False\n    TRANSLATIONS = False\n    WEB_CRAWLED = False\n    QUALITY_WARNINGS: List[QualityWarning] = []\n    GENRES: List[Genre] = []\n    HAS_OVERLAP_WITH: List[Union[Type, str]] = []\n    USED_BY = None\n    DUMMY = False\n    SINGLE_OUTPUT_FILE = True\n    HAS_PREDEFINED_VALIDATION_SET = False\n\n    # Statistics\n    TOKENS = None\n    BYTES = None\n\n    counter = Counter()\n\n    def __init__(\n        self,\n        text_datasets_dir: Optional[str] = None,\n        raw_datasets_dir: Optional[str] = None,\n        workers: int = 1,\n        output_text_field: str = \"text\",\n        override_output: bool = False,\n        limit: int = 0,\n        skip_items: int = 0,\n        hf_auth_token: str = None,\n        print_write_progress: int = 10_000,\n        min_length: Optional[int] = None,\n        json_ensure_ascii: bool = False,\n        title_delimiter: str = \":\\n\\n\",\n        paragraph_delimiter: str = \"\\n\\n\",\n        sentence_delimiter: str = \" \",\n        output_format: Literal[\"jsonl\", \"parquet\"] = \"jsonl\",\n   
     output_compression: Optional[\n            str\n        ] = None,  # jsonl: gzip, parquet: \u2018NONE\u2019, \u2018SNAPPY\u2019, \u2018GZIP\u2019, \u2018BROTLI\u2019, \u2018LZ4\u2019, \u2018ZSTD\u2019\n        output_batch_size: int = 1000,\n        shuffled_datasets_dir: Optional[str] = None,\n        max_output_chunk_uncompressed_bytes: Optional[int] = None,\n        max_output_chunk_rows: Optional[int] = None,\n        config: Union[Config, dict] = None,\n        **kwargs,\n    ) -&gt; None:\n        self.text_datasets_dir = text_datasets_dir\n        self.raw_datasets_dir = raw_datasets_dir\n        self.workers = workers\n        self.output_text_field = output_text_field\n        self.override_output = override_output\n        self.limit = limit\n        self.skip_items = skip_items\n        self.hf_auth_token = hf_auth_token\n        self.print_write_progress = print_write_progress\n        self.min_length = min_length if min_length is not None else DEFAULT_MIN_TEXT_LENGTH\n        self.json_ensure_ascii = json_ensure_ascii\n        self.title_delimiter = title_delimiter\n        self.paragraph_delimiter = paragraph_delimiter\n        self.sentence_delimiter = sentence_delimiter\n        self.output_format = output_format\n        self.output_compression = output_compression\n        self.output_batch_size = output_batch_size\n        self.shuffled_datasets_dir = shuffled_datasets_dir\n        self.max_output_chunk_uncompressed_bytes = max_output_chunk_uncompressed_bytes\n        self.max_output_chunk_rows = max_output_chunk_rows\n\n        # Timer for statistics\n        self.start_time = datetime.datetime.now()\n        self.end_time = None\n\n        # Generate config from dict\n        if isinstance(config, dict):\n            config = Config(**config)\n\n        self.config = config\n\n        # Set kwargs\n        for k, v in kwargs.items():\n            if hasattr(self, k):\n                setattr(self, k, v)\n            else:\n                
logger.warning(\"kwarg provided but not attribute of dataset class: %s\", k)\n\n    def get_source_id(self):\n        if self.SOURCE_ID:\n            return self.SOURCE_ID\n        else:\n            return self.DATASET_ID\n\n    def get_language_code(self, unknown: str = \"unknown\", mixed: str = \"mixed\"):\n        if len(self.LANGUAGES) == 1:\n            lang = self.LANGUAGES[0]\n        elif len(self.LANGUAGES) == 0:\n            lang = unknown\n        else:\n            lang = mixed\n\n        return lang\n\n    def get_output_text_field(self):\n        return self.output_text_field\n\n    def has_output_files(self, min_file_size: int = 1, shuffled=False) -&gt; bool:\n        return self.has_single_output_file(\n            min_file_size=min_file_size, shuffled=shuffled\n        ) or self.has_chunked_output_files(min_file_size=min_file_size, shuffled=shuffled)\n\n    def has_single_output_file(self, min_file_size: int = 1, shuffled=False) -&gt; bool:\n        fp = self.get_single_output_file_path(shuffled=shuffled)\n\n        return fp is not None and os.path.exists(fp) and os.stat(fp).st_size &gt;= min_file_size\n\n    def has_chunked_output_files(self, min_file_size: int = 1, shuffled=False) -&gt; bool:\n        for fp in self.get_chunked_output_file_paths(shuffled=shuffled):\n            if os.path.exists(fp) and os.stat(fp).st_size &gt;= min_file_size:\n                return True\n            break\n\n        return False\n\n    def get_output_file_paths(self, single=False, chunked=False, shuffled=False) -&gt; List[str]:\n        if single:\n            return [self.get_single_output_file_path(shuffled=shuffled)]\n        elif chunked:\n            return self.get_chunked_output_file_paths(shuffled=shuffled)\n        else:\n            # auto determine based on existing files\n            if self.has_chunked_output_files(shuffled=shuffled):\n                return self.get_chunked_output_file_paths(shuffled=shuffled)\n            else:\n                
return [self.get_single_output_file_path(shuffled=shuffled)]\n\n    def get_output_file_path(self):\n        raise NotImplementedError(\"Use `get_output_file_paths` instead!\")\n\n    def get_output_extension(self, with_dot: bool = True, shuffled: bool = False) -&gt; str:\n        extension = \".\" if with_dot else \"\"\n\n        if shuffled:\n            extension += \"shuffled.\"\n\n        extension += self.output_format\n\n        if self.output_format == \"jsonl\" and self.output_compression == \"gzip\":\n            # Simply add \".gz\" as extension as smart_open will take about the compression\n            extension += \".gz\"\n\n        return extension\n\n    def get_output_dir(self, shuffled=False):\n        if shuffled:\n            if self.shuffled_datasets_dir:\n                return self.shuffled_datasets_dir\n            raise ValueError(\"shuffled_datasets_dir is not set\")\n        else:\n            return self.text_datasets_dir\n\n    def get_single_output_file_path(self, shuffled=False) -&gt; str:\n        return os.path.join(\n            self.get_output_dir(shuffled=shuffled), self.DATASET_ID + self.get_output_extension(shuffled=shuffled)\n        )\n\n    def get_chunked_output_file_paths(self, shuffled=False) -&gt; List[str]:\n        output_dir_path = Path(self.get_output_dir(shuffled=shuffled))\n\n        return list(\n            output_dir_path.glob(f\"{self.DATASET_ID}.part-*-of-*{self.get_output_extension(shuffled=shuffled)}\")\n        )\n\n    def get_chunked_output_file_path(self, part: int, total_parts: Optional[int] = None, shuffled=False) -&gt; str:\n        if total_parts is None:\n            fn = f\"{self.DATASET_ID}.part-{part:04d}{self.get_output_extension(shuffled=shuffled)}\"\n        else:\n            fn = f\"{self.DATASET_ID}.part-{part:04d}-of-{total_parts:04d}{self.get_output_extension(shuffled=shuffled)}\"\n\n        return os.path.join(self.get_output_dir(shuffled=shuffled), fn)\n\n    def 
get_single_or_chunked_output_file_path(\n        self, part: Optional[int] = None, total_parts: Optional[int] = None, shuffled=False\n    ) -&gt; str:\n        if part is None:\n            return self.get_single_output_file_path(shuffled=shuffled)\n        else:\n            return self.get_chunked_output_file_path(part, total_parts, shuffled=shuffled)\n\n    def filter_texts_or_documents(self, texts_or_documents: Iterable[Union[str, Document]]):\n        if self.config.use_documents:\n            return self.filter_documents(texts_or_documents)\n        else:\n            return self.filter_texts(texts_or_documents)\n\n    def filter_documents(self, documents: Iterable[Document]):\n        \"\"\"Applies basic filtering on the texts before saving\"\"\"\n        for doc in documents:\n            if self.min_length &gt; 0 and len(doc.text) &lt; self.min_length:\n                # skip because of short text length\n                self.counter.update({\"filtered_short_text\": 1})\n                continue\n\n            yield doc\n\n    def filter_texts(self, texts: Iterable[str]):\n        \"\"\"Applies basic filtering on the texts before saving\"\"\"\n        for text in texts:\n            if self.min_length &gt; 0 and len(text) &lt; self.min_length:\n                # skip because of short text length\n                self.counter.update({\"filtered_short_text\": 1})\n                continue\n\n            yield text\n\n    def remove_texts(self):\n        for fp in self.get_output_file_paths():\n            logger.warning(f\"Removing {fp}\")\n            os.remove(fp)\n\n    def save_texts(self, texts: Iterable[str], append: bool = False):\n        \"\"\"Save texts in different formats\"\"\"\n        if self.has_output_files() and not self.override_output:\n            raise FileExistsError(f\"Output exists already (override not enabled): {self.get_output_file_paths()}\")\n\n        if self.output_format == \"jsonl\":\n            docs_count = 
self.save_texts_to_jsonl(texts, append=append)\n\n        elif self.output_format == \"parquet\":\n            if append:\n                raise NotImplementedError(\"Appending is not supported by parquet output format\")\n\n            docs_count, saved_chunks = self.save_texts_to_parquet(texts)\n\n            self.counter.update({\"saved_chunks\": saved_chunks})\n        else:\n            raise ValueError(f\"Unsupported output format: {self.output_format}\")\n\n        logger.info(f\"Documents saved: {docs_count:,}\")\n\n        self.counter.update({\"docs_count\": docs_count})\n\n        if docs_count == 0:\n            logger.warning(\"No documents have been saved!\")\n\n            # delete empty output file\n            if self.has_output_files():\n                self.remove_texts()\n\n        return docs_count\n\n    def save_texts_to_parquet(self, texts: Iterable[str], file_path: Optional[str] = None, apply_filter: bool = True):\n        \"\"\"Save text in parquet (single column schema, in batches)\"\"\"\n        assert self.output_format == \"parquet\"\n\n        if file_path is None:\n            file_path = self.get_output_file_paths(single=True)[0]\n\n        if apply_filter:\n            texts = self.filter_texts_or_documents(texts)\n\n        if self.config.use_documents:\n            # document schema\n            schema = self.get_document_schema().get_pa_schema()\n        else:\n            # text-only schema\n            schema = pa.schema(\n                [\n                    (self.get_output_text_field(), pa.string()),\n                ]\n            )\n\n        # Max. 
chunk size is multiplied with this factor\n        # (to account for inaccurate chunk sizes due to batching)\n        safety_factor = 0.975\n\n        # Save as Parquet file\n        logger.info(f\"Writing parquet output ({self.output_batch_size=}; {self.limit=}; {self.output_compression=})\")\n\n        saved_docs, saved_chunks = save_texts_to_parquet_chunks(\n            texts=texts,\n            schema=schema,\n            max_chunk_uncompressed_bytes=(\n                self.max_output_chunk_uncompressed_bytes * safety_factor\n                if self.max_output_chunk_uncompressed_bytes is not None\n                else None\n            ),\n            max_chunk_rows=self.max_output_chunk_rows,\n            output_path_func=self.get_single_or_chunked_output_file_path,\n            compression=get_parquet_compression(self.output_compression),\n            batch_size=self.output_batch_size,\n            print_write_progress=self.print_write_progress,\n            limit=self.limit,\n        )\n\n        if hasattr(texts, \"terminate\"):\n            logger.info(\"Killing all remaining workers, if any (iterator end)\")\n            texts.terminate()\n\n        return saved_docs, saved_chunks\n\n    def save_texts_to_jsonl(self, texts: Iterable[str], append: bool = False):\n        \"\"\"Write JSONL files to &lt;output_dir&gt;/&lt;DATASET_ID&gt;.jsonl\n        (each line is a JSON object with \"doc\" field and text as plain text)\n        \"\"\"\n        mode = \"a\" if append else \"w\"\n        fp = self.get_output_file_paths(single=True)[0]\n\n        # Save as JSONL\n        logger.info(f\"Writing JSONL output to {fp} ({mode=})\")\n\n        docs_count = 0\n\n        with smart_open(fp, mode) as f:\n            for docs_count, text in enumerate(self.filter_texts(texts), 1):\n                f.write(json.dumps({self.get_output_text_field(): text}, ensure_ascii=self.json_ensure_ascii) + \"\\n\")\n\n                if docs_count &gt; 0 and (docs_count % 
self.print_write_progress) == 0:\n                    logger.info(f\"Written {docs_count:,} docs ...\")\n\n                if self.limit &gt; 0 and docs_count &gt;= self.limit:\n                    logger.warning(f\"Limit reached ({docs_count:,} docs)\")\n\n                    if hasattr(texts, \"terminate\"):\n                        logger.info(\"Killing all remaining workers, if any\")\n                        texts.terminate()\n                    break\n\n        if hasattr(texts, \"terminate\"):\n            logger.info(\"Killing all remaining workers, if any (iterator end)\")\n            texts.terminate()\n\n        return docs_count\n\n    def get_hf_auth_token(self):\n        if self.hf_auth_token:\n            return self.hf_auth_token\n        else:\n            env_token = os.environ.get(\"HF_PASSWORD\")\n\n            if env_token:\n                logger.info(\"Using HF auth token from env var\")\n                return env_token\n\n        return None\n\n    def get_local_dataset_dir(self):\n        if self.config:\n            if self.DATASET_ID in self.config.local_dirs_by_dataset_id:\n                return self.config.local_dirs_by_dataset_id[self.DATASET_ID]\n\n            if self.get_source_id() in self.config.local_dirs_by_source_id:\n                return self.config.local_dirs_by_source_id[self.get_source_id()]\n\n        if self.LOCAL_DIRS:  # TODO deprecated -&gt; use config instead!\n            # manually defined dataset directory\n            return get_path_by_system(self.LOCAL_DIRS)\n        elif self.raw_datasets_dir:\n            # automatically based on language + dataset_id\n            return os.path.join(self.raw_datasets_dir, self.get_language_code(), self.DATASET_ID)\n        else:\n            raise ValueError(\"Either `LOCAL_DIRS` or `raw_datasets_dir` must be defined.\")\n\n    def get_dataset_file_paths(\n        self,\n        dataset_dir: Optional[str] = None,\n        single_file: bool = False,\n        
subdirectories: bool = False,\n        needed_suffix: Optional[Union[str, Tuple[str]]] = None,\n        return_none_if_not_dir_exists: bool = False,\n    ):\n        if dataset_dir is None:\n            dataset_dir = self.get_local_dataset_dir()\n\n        if not os.path.exists(dataset_dir):\n            logger.warning(f\"Download directory does not exist: {dataset_dir}\")\n\n            if return_none_if_not_dir_exists:\n                return None\n            else:\n                return []\n\n        if subdirectories:\n            # find files in all subdirectories\n            logger.info(f\"Finding dataset files in all subdirectories: {dataset_dir}\")\n            fps = [os.path.join(path, name) for path, subdirs, files in os.walk(dataset_dir) for name in files]\n\n        else:\n            # root-level files\n            fps = [os.path.join(dataset_dir, f) for f in os.listdir(dataset_dir)]\n\n        # filter by suffix\n        fps = [f for f in fps if needed_suffix is None or f.endswith(needed_suffix)]\n\n        # filter by file type\n        fps = [fp for fp in fps if os.path.isfile(fp)]\n\n        if single_file:\n            if len(fps) &gt; 1:\n                raise FileExistsError(f\"Multiple files in download directory but only a single one was expected: {fps}\")\n            elif len(fps) == 0:\n                raise FileNotFoundError(f\"No file found but a single one was expected: {fps}\")\n\n            return fps[0]\n\n        return fps\n\n    def decompress(self):\n        raise NotImplementedError\n\n    def is_dummy(self):\n        return self.DUMMY\n\n    def is_downloaded(self):\n        return False\n\n    def download(self):\n        # Download all DOWNLOAD_URLS into local dataset dir\n        output_dir = self.get_local_dataset_dir()\n\n        logger.info(f\"Downloading {len(self.DOWNLOAD_URLS)} files to {output_dir}\")\n\n        if not os.path.exists(output_dir):\n            logger.info(f\"Creating download dir: {output_dir}\")\n  
          os.makedirs(output_dir)\n\n        for source_url in self.DOWNLOAD_URLS:\n            if isinstance(source_url, tuple):\n                source_url, target_filename = source_url\n                output_filepath = os.path.join(output_dir, target_filename)\n\n                if os.path.exists(output_filepath):\n                    logger.warning(f\"Output exists already: {output_filepath}\")\n                    continue\n            else:\n                output_filepath = output_dir  # auto file name\n\n            try:\n                logger.info(f\"Download URL: {source_url}\")\n                logger.info(f\"Output path: {output_filepath}\")\n\n                out_filename = wget.download(source_url, out=output_filepath)\n                logger.info(f\"Completed {out_filename}\")\n            except HTTPError as e:\n                logger.error(f\"Error {e}\")\n\n    def get_tokens(self):\n        if self.TOKENS:\n            return self.TOKENS\n        elif self.get_bytes():\n            # Estimate tokens based on bytes\n            return int(self.get_bytes() * TOKENS_PER_BYTE)\n        else:\n            return None\n\n    def get_bytes(self):\n        return self.BYTES\n\n    def get_texts_from_conllu_file(self, file_handler: TextIO):\n        import conllu\n\n        text = None\n\n        # try:\n        for sentence in conllu.parse_incr(file_handler):\n            if \"newdoc id\" in sentence.metadata:\n                if text is not None:\n                    # doc completed\n                    yield text\n                text = \"\"  # init empty document\n\n            # append text to doc\n            if \"text\" in sentence.metadata:\n                if not text:\n                    text = \"\"  # some conllu are not using doc ids -&gt; force init\n                else:\n                    text += \" \"  # whitespace betweeen sentences\n\n                text += sentence.metadata[\"text\"]\n\n            if \"title\" in 
sentence.metadata:\n                text += self.title_delimiter\n\n        # yield last document\n        if text:\n            yield text\n\n        # except ParseException as e:\n        #     # TODO\n        #     logger.error(e)\n\n    def get_texts(self) -&gt; Iterable[str]:\n        raise NotImplementedError\n\n    def extract_plaintext(self) -&gt; int:\n        saved_texts_count = self.save_texts(self.get_texts())\n\n        if self.counter:\n            logger.info(f\"Statistics {self.counter}\")\n\n        if self.config.save_stats:\n            self.save_stats()\n\n        return saved_texts_count\n\n    def get_output_rows_count(self, shuffled: bool = False) -&gt; int:\n        \"\"\"Read metadata from parquet files and extract number of rows\"\"\"\n        if self.output_format == \"parquet\":\n            output_paths = list(self.get_output_file_paths(shuffled=shuffled))\n\n            # Filter for existing\n            output_paths = [output_path for output_path in output_paths if os.path.exists(output_path)]\n\n            if output_paths:\n                rows_count = 0\n\n                for output_path in output_paths:\n                    with open(output_path, \"rb\") as f:\n                        parquet_file = pq.ParquetFile(\n                            f,\n                            # increased to avoid OSErrors\n                            thrift_string_size_limit=1000000000,  # default: 100000000\n                            thrift_container_size_limit=10000000,  # default: 1000000\n                        )\n                        rows_count += parquet_file.metadata.num_rows\n\n                        logger.debug(\"Rows = %s in %s\", rows_count, output_path)\n\n                return rows_count\n\n            logger.debug(\"No output files exists for %s\", self.DATASET_ID)\n            return -1\n        else:\n            raise ValueError(f\"Cannot determine the output rows count with {self.output_format=}\")\n\n    def 
get_compression_from_output_files(self, shuffled=False):\n        \"\"\"NOTE: Currently only implemented for `parquet` format.\"\"\"\n        if self.output_format == \"parquet\":\n            for output_path in self.get_output_file_paths(shuffled=shuffled):\n                if os.path.exists(output_path):\n                    with open(output_path, \"rb\") as f:\n                        parquet_file = pq.ParquetFile(\n                            f,\n                            # increased to avoid OSErrors\n                            thrift_string_size_limit=1000000000,  # default: 100000000\n                            thrift_container_size_limit=10000000,  # default: 1000000\n                        )\n                        parquet_metadata = parquet_file.metadata\n                        for i in range(parquet_metadata.num_row_groups):\n                            for j in range(parquet_metadata.num_columns):\n                                return parquet_file.metadata.row_group(i).column(j).compression\n\n        return \"unknown\"\n\n    def generate_texts_from_output(\n        self,\n        shuffled: bool = False,\n        batch_size: Optional[int] = None,\n        limit: int = 0,\n        offset: int = 0,\n        shuffle_output_file_paths: bool = False,\n        reader_implementation: Literal[\"polars_read_parquet\", \"pyarrow\"] = \"pyarrow\",\n        cast_to_py_string: bool = False,\n    ) -&gt; Iterable[Union[str, pa.StringScalar]]:\n        \"\"\"A iterator over texts from processed output files.\"\"\"\n        if batch_size is None:\n            batch_size = self.output_batch_size\n\n        if self.output_format != \"parquet\":\n            raise ValueError(f\"Cannot generate texts with {self.output_format=}\")\n\n        # Check if output files exists and sort them\n        output_paths = [\n            file_path\n            for file_path in sorted(self.get_output_file_paths(shuffled=shuffled))\n            if os.path.exists(file_path)\n      
  ]\n\n        # Count generated rows\n        rows = 0\n        rows_limit = limit - offset\n\n        # if limit &gt; 0:\n        #     batch_size = min(batch_size, limit)\n\n        # Shuffle output chunks:\n        # This changes the order in that the chunks are read ensure also shuffling on the full dataset level.\n        if shuffle_output_file_paths:\n            random.seed(self.config.seed)  # reset seed to avoid inference by other scripts\n            random.shuffle(output_paths)\n\n        chunk_start = 0\n        chunk_end = None\n\n        if output_paths:\n            for file_path in output_paths:\n                logger.info(\"Generating text from %s\", file_path)\n\n                # PyArrow implementation\n                with open(file_path, \"rb\") as file_handler:\n                    pq_file = pq.ParquetFile(\n                        file_handler,\n                        # memory_map=False,\n                    )\n                    file_rows_count = pq_file.metadata.num_rows\n\n                    chunk_end = chunk_start + file_rows_count - 1\n\n                    # Should we read from the current chunk?\n                    # Yes, if\n                    # - offset is smaller or equal chunk_start\n                    # (- limit is greater or equal chunk_end) --- limit does not matter\n\n                    # variants\n                    # A) requested rows start in chunk and ends in chunk\n                    # B) requested rows start in chunk but ends in following chunk\n                    # C) requested rows start before chunk and ends in chunk\n                    # D) requested rows start before chunk and ends in following chunk\n\n                    if (\n                        chunk_start &lt;= offset &lt; chunk_end\n                        or offset &lt; chunk_start\n                        and (limit == 0 or chunk_start &lt; limit)\n                    ):\n                        file_offset = max(\n                            
0, offset - chunk_start\n                        )  # global offset minus start of current file (current chunk)\n                        file_limit = (\n                            max(0, limit - chunk_start) if limit &gt; 0 else 0  # limit - chunk_start\n                        )  # Length of the slice: global limit minus start of current chunk\n                        # TODO before: limit - chunk_start - file_offset\n\n                        logger.debug(\n                            \"Reading file chunk from %s: file [%s - %s]; global [%s - %s]; chunk [%s - %s]\",\n                            file_path,\n                            file_offset,\n                            file_limit,\n                            offset,\n                            limit,\n                            chunk_start,\n                            chunk_end,\n                        )\n                        if reader_implementation == \"pyarrow\":\n                            # PyArrow implementation with iter_batches\n                            # with open(file_path, \"rb\") as file_handler:\n                            #     parquet_file = pq.ParquetFile(file_handler)\n\n                            for batch_idx, pq_batch in enumerate(\n                                pq_file.iter_batches(\n                                    columns=[self.get_output_text_field()], batch_size=batch_size, use_threads=False\n                                )\n                            ):\n                                for row_idx, text_column in enumerate(pq_batch.columns[0], batch_idx * batch_size):\n                                    if row_idx &gt;= file_offset:\n                                        if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                                            # break row loop\n                                            logger.debug(\"break row loop\")\n                                            break\n\n                                        text: 
pa.StringScalar = text_column\n\n                                        if cast_to_py_string:\n                                            # cast to string\n                                            text = text_column.as_py()\n\n                                        yield text\n                                        rows += 1\n\n                                if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                                    # break batch loop\n                                    logger.debug(\"break batch loop\")\n                                    break\n\n                            # PyArrow implementation with read_row_group\n                            # with open(file_path, \"rb\") as file_handler:\n                            #     parquet_file = pq.ParquetFile(file_handler)\n\n                            #     # 1. What row groups need to be read?\n                            #     row_groups, group_idx_to_offset_last_row = get_selected_row_groups(\n                            #         parquet_file, file_offset, file_limit\n                            #     )\n                            #     logger.debug(\"Selected row groups: %s; %s\", row_groups, group_idx_to_offset_last_row)\n\n                            #     # 2. Read selected row groups\n                            #     for selected_row_group in row_groups:\n                            #         logger.debug(\"Read row group: %s\", selected_row_group)\n                            #         group_table = parquet_file.read_row_group(\n                            #             selected_row_group, columns=[self.get_output_text_field()]\n                            #         )\n\n                            #         # What offsets and limit? 
(only if needed)\n                            #         if group_idx_to_offset_last_row is not None:\n                            #             group_offset, _ = group_idx_to_offset_last_row[selected_row_group]\n\n                            #             row_offset = max(0, file_offset - group_offset)\n                            #             logger.debug(\"Row group: %s; row offset: %s\", selected_row_group, row_offset)\n\n                            #         # Iterate over rows\n                            #         for row_idx, text_column in enumerate(group_table.columns[0]):\n                            #             # Skip rows before offset\n                            #             if group_idx_to_offset_last_row is None or row_idx &gt;= row_offset:\n                            #                 if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                            #                     # break row loop\n                            #                     logger.debug(\"break row loop\")\n                            #                     break\n\n                            #                 text = text_column.as_py()  # cast to str\n                            #                 yield text\n                            #                 rows += 1\n\n                            #         if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                            #             # break row group loop\n                            #             logger.debug(\"break row group loop\")\n                            #             break\n\n                        elif reader_implementation == \"polars_read_parquet\":\n                            # Polars \"scan_parquet\" implementation: Error \"Segmentation fault (core dumped)\"\n                            # df = (\n                            #     pl.scan_parquet(file_path, low_memory=True).collect(\n                            #     streaming=True\n                            # ).slice(offset=file_offset, 
length=file_limit if file_limit != 0 else None)\n                            #     .collect(streaming=True)\n                            # )\n                            # text_column_index = df.columns.index(self.get_output_text_field())\n\n                            df = pl.read_parquet(\n                                file_path, low_memory=True, columns=[self.get_output_text_field()]\n                            ).slice(offset=file_offset, length=file_limit if file_limit != 0 else None)\n                            text_column_index = 0\n\n                            # Iterate over rows\n                            for row in df.iter_rows():\n                                text = row[text_column_index]\n\n                                if cast_to_py_string:\n                                    text = str(text)\n\n                                yield text\n                                rows += 1\n\n                                if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                                    # break row loop\n                                    break\n                            else:\n                                raise ValueError(\"Invalid `reader_implementation`\")\n                    else:\n                        logger.debug(\"Skip this file because output does not contain the requested rows: %s\", file_path)\n\n                    # current_offset += file_rows_count  # TODO +1?\n                    chunk_start = chunk_end + 1  # set start for the next chunk\n\n                if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                    # break file loop\n                    logger.debug(\"break file loop\")\n                    break\n        else:\n            logger.warning(\"Cannot generate texts because output files do not exist.\")\n\n        logger.info(\n            \"Texts generated: %s (expected size: %s; offset: %s; limit: %s;)\", rows, limit - offset, offset, limit\n        )\n\n    def 
get_estimated_bytes_from_output(self, shuffled: bool = False, read_first_n_rows: int = 1_000) -&gt; int:\n        \"\"\"Estimate byte size of output text:\n        - read first N rows of shuffled output files and count their byte size\n        - multiply counted bytes by total number of rows\n        \"\"\"\n        if not shuffled:\n            raise NotImplementedError\n\n        if self.output_format != \"parquet\":\n            raise NotImplementedError\n\n        bytes_sum = 0\n        total_rows = 0\n\n        # iterate over output files (use shuffled files for a better estimate)\n        for output_path in self.get_output_file_paths(shuffled=shuffled):\n            if os.path.exists(output_path):\n                # read the first n rows\n                df = pl.scan_parquet(\n                    output_path,\n                    low_memory=True,\n                    n_rows=read_first_n_rows,\n                ).collect(streaming=True)\n                for row in df.iter_rows():\n                    text = str(row[0])\n                    bytes_sum += len(text.encode(\"utf-8\"))  # count the byte size of the text\n\n                # read total row count from metadata\n                with open(output_path, \"rb\") as f:\n                    parquet_file = pq.ParquetFile(\n                        f,\n                        # increased to avoid OSErrors\n                        thrift_string_size_limit=1000000000,  # default: 100000000\n                        thrift_container_size_limit=10000000,  # default: 1000000\n                    )\n                    total_rows += parquet_file.metadata.num_rows\n\n        # estimated bytes\n        bytes_per_row = bytes_sum / read_first_n_rows\n        total_bytes = int(total_rows * bytes_per_row)\n\n        return total_bytes\n\n    def get_sampling_factor(self) -&gt; float:\n        \"\"\"Sampling is defined based on dataset ID, source ID, or language.\"\"\"\n        if self.config:\n            if self.DATASET_ID 
in self.config.sampling_factor_by_dataset_id:\n                return self.config.sampling_factor_by_dataset_id[self.DATASET_ID]\n\n            if self.get_source_id() in self.config.sampling_factor_by_source_id:\n                return self.config.sampling_factor_by_source_id[self.get_source_id()]\n\n            if self.get_language_code() in self.config.sampling_factor_by_language:\n                return self.config.sampling_factor_by_language[self.get_language_code()]\n\n        return 1.0  # default factor\n\n    def is_selected(self) -&gt; bool:\n        \"\"\"Is this dataset part of selected datasets or sources?\"\"\"\n        if (\n            self.DATASET_ID in self.config.selected_dataset_ids\n            or self.get_source_id() in self.config.selected_source_ids\n        ):\n            return True\n        else:\n            # try fnmatch\n            for pattern in self.config.get_selected_dataset_ids(mode=\"fnmatch\"):\n                if fnmatch.fnmatch(self.DATASET_ID, pattern):\n                    return True\n\n            return False\n\n    def get_shuffled_output_file_path(self, unshuffled_output_file_path: str) -&gt; str:\n        output_file_name = Path(unshuffled_output_file_path).name\n\n        return os.path.join(\n            self.config.shuffled_datasets_dir, output_file_name.replace(\".parquet\", \".shuffled.parquet\")\n        )\n\n    def save_stats(self):\n        \"\"\"Save the processing statistics (counter) into a JSON file in the output directory.\"\"\"\n        if self.counter is None:\n            logger.error(\"Cannot save statistics because none were recorded.\")\n            return\n\n        date_format = \"%Y-%m-%d_%H%M%S\"\n        self.end_time = datetime.datetime.now()\n        short_uuid = str(uuid.uuid4())[:5]\n        stats_file_name = f\"stats_{self.end_time.strftime(date_format)}_{short_uuid}.{self.config.get_job_id()}.json\"\n        stats_file_path = os.path.join(self.get_output_dir(), stats_file_name)\n\n      
  stats = {\n            \"counter\": dict(self.counter),\n            \"start_time\": self.start_time.strftime(date_format),\n            \"end_time\": self.end_time.strftime(date_format),\n            \"job_id\": self.config.get_job_id(),\n            # \"config\": self.config,\n        }\n\n        with open(stats_file_path, \"w\") as f:\n            json.dump(stats, f, indent=4)\n\n        logger.info(f\"Statistics saved to {stats_file_path}\")\n\n        return stats_file_path\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.filter_documents","title":"<code>filter_documents(documents)</code>","text":"<p>Applies basic filtering on the texts before saving</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def filter_documents(self, documents: Iterable[Document]):\n    \"\"\"Applies basic filtering on the texts before saving\"\"\"\n    for doc in documents:\n        if self.min_length &gt; 0 and len(doc.text) &lt; self.min_length:\n            # skip because of short text length\n            self.counter.update({\"filtered_short_text\": 1})\n            continue\n\n        yield doc\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.filter_texts","title":"<code>filter_texts(texts)</code>","text":"<p>Applies basic filtering on the texts before saving</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def filter_texts(self, texts: Iterable[str]):\n    \"\"\"Applies basic filtering on the texts before saving\"\"\"\n    for text in texts:\n        if self.min_length &gt; 0 and len(text) &lt; self.min_length:\n            # skip because of short text length\n            self.counter.update({\"filtered_short_text\": 1})\n            continue\n\n        yield text\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.generate_texts_from_output","title":"<code>generate_texts_from_output(shuffled=False, 
batch_size=None, limit=0, offset=0, shuffle_output_file_paths=False, reader_implementation='pyarrow', cast_to_py_string=False)</code>","text":"<p>A iterator over texts from processed output files.</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def generate_texts_from_output(\n    self,\n    shuffled: bool = False,\n    batch_size: Optional[int] = None,\n    limit: int = 0,\n    offset: int = 0,\n    shuffle_output_file_paths: bool = False,\n    reader_implementation: Literal[\"polars_read_parquet\", \"pyarrow\"] = \"pyarrow\",\n    cast_to_py_string: bool = False,\n) -&gt; Iterable[Union[str, pa.StringScalar]]:\n    \"\"\"A iterator over texts from processed output files.\"\"\"\n    if batch_size is None:\n        batch_size = self.output_batch_size\n\n    if self.output_format != \"parquet\":\n        raise ValueError(f\"Cannot generate texts with {self.output_format=}\")\n\n    # Check if output files exists and sort them\n    output_paths = [\n        file_path\n        for file_path in sorted(self.get_output_file_paths(shuffled=shuffled))\n        if os.path.exists(file_path)\n    ]\n\n    # Count generated rows\n    rows = 0\n    rows_limit = limit - offset\n\n    # if limit &gt; 0:\n    #     batch_size = min(batch_size, limit)\n\n    # Shuffle output chunks:\n    # This changes the order in that the chunks are read ensure also shuffling on the full dataset level.\n    if shuffle_output_file_paths:\n        random.seed(self.config.seed)  # reset seed to avoid inference by other scripts\n        random.shuffle(output_paths)\n\n    chunk_start = 0\n    chunk_end = None\n\n    if output_paths:\n        for file_path in output_paths:\n            logger.info(\"Generating text from %s\", file_path)\n\n            # PyArrow implementation\n            with open(file_path, \"rb\") as file_handler:\n                pq_file = pq.ParquetFile(\n                    file_handler,\n                    # memory_map=False,\n                )\n   
             file_rows_count = pq_file.metadata.num_rows\n\n                chunk_end = chunk_start + file_rows_count - 1\n\n                # Should we read from the current chunk?\n                # Yes, if\n                # - offset is smaller or equal chunk_start\n                # (- limit is greater or equal chunk_end) --- limit does not matter\n\n                # variants\n                # A) requested rows start in chunk and ends in chunk\n                # B) requested rows start in chunk but ends in following chunk\n                # C) requested rows start before chunk and ends in chunk\n                # D) requested rows start before chunk and ends in following chunk\n\n                if (\n                    chunk_start &lt;= offset &lt; chunk_end\n                    or offset &lt; chunk_start\n                    and (limit == 0 or chunk_start &lt; limit)\n                ):\n                    file_offset = max(\n                        0, offset - chunk_start\n                    )  # global offset minus start of current file (current chunk)\n                    file_limit = (\n                        max(0, limit - chunk_start) if limit &gt; 0 else 0  # limit - chunk_start\n                    )  # Length of the slice: global limit minus start of current chunk\n                    # TODO before: limit - chunk_start - file_offset\n\n                    logger.debug(\n                        \"Reading file chunk from %s: file [%s - %s]; global [%s - %s]; chunk [%s - %s]\",\n                        file_path,\n                        file_offset,\n                        file_limit,\n                        offset,\n                        limit,\n                        chunk_start,\n                        chunk_end,\n                    )\n                    if reader_implementation == \"pyarrow\":\n                        # PyArrow implementation with iter_batches\n                        # with open(file_path, \"rb\") as file_handler:\n  
                      #     parquet_file = pq.ParquetFile(file_handler)\n\n                        for batch_idx, pq_batch in enumerate(\n                            pq_file.iter_batches(\n                                columns=[self.get_output_text_field()], batch_size=batch_size, use_threads=False\n                            )\n                        ):\n                            for row_idx, text_column in enumerate(pq_batch.columns[0], batch_idx * batch_size):\n                                if row_idx &gt;= file_offset:\n                                    if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                                        # break row loop\n                                        logger.debug(\"break row loop\")\n                                        break\n\n                                    text: pa.StringScalar = text_column\n\n                                    if cast_to_py_string:\n                                        # cast to string\n                                        text = text_column.as_py()\n\n                                    yield text\n                                    rows += 1\n\n                            if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                                # break batch loop\n                                logger.debug(\"break batch loop\")\n                                break\n\n                        # PyArrow implementation with read_row_group\n                        # with open(file_path, \"rb\") as file_handler:\n                        #     parquet_file = pq.ParquetFile(file_handler)\n\n                        #     # 1. 
What row groups need to be read?\n                        #     row_groups, group_idx_to_offset_last_row = get_selected_row_groups(\n                        #         parquet_file, file_offset, file_limit\n                        #     )\n                        #     logger.debug(\"Selected row groups: %s; %s\", row_groups, group_idx_to_offset_last_row)\n\n                        #     # 2. Read selected row groups\n                        #     for selected_row_group in row_groups:\n                        #         logger.debug(\"Read row group: %s\", selected_row_group)\n                        #         group_table = parquet_file.read_row_group(\n                        #             selected_row_group, columns=[self.get_output_text_field()]\n                        #         )\n\n                        #         # What offsets and limit? (only if needed)\n                        #         if group_idx_to_offset_last_row is not None:\n                        #             group_offset, _ = group_idx_to_offset_last_row[selected_row_group]\n\n                        #             row_offset = max(0, file_offset - group_offset)\n                        #             logger.debug(\"Row group: %s; row offset: %s\", selected_row_group, row_offset)\n\n                        #         # Iterate over rows\n                        #         for row_idx, text_column in enumerate(group_table.columns[0]):\n                        #             # Skip rows before offset\n                        #             if group_idx_to_offset_last_row is None or row_idx &gt;= row_offset:\n                        #                 if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                        #                     # break row loop\n                        #                     logger.debug(\"break row loop\")\n                        #                     break\n\n                        #                 text = text_column.as_py()  # cast to str\n                        #   
              yield text\n                        #                 rows += 1\n\n                        #         if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                        #             # break row group loop\n                        #             logger.debug(\"break row group loop\")\n                        #             break\n\n                    elif reader_implementation == \"polars_read_parquet\":\n                        # Polars \"scan_parquet\" implementation: Error \"Segmentation fault (core dumped)\"\n                        # df = (\n                        #     pl.scan_parquet(file_path, low_memory=True).collect(\n                        #     streaming=True\n                        # ).slice(offset=file_offset, length=file_limit if file_limit != 0 else None)\n                        #     .collect(streaming=True)\n                        # )\n                        # text_column_index = df.columns.index(self.get_output_text_field())\n\n                        df = pl.read_parquet(\n                            file_path, low_memory=True, columns=[self.get_output_text_field()]\n                        ).slice(offset=file_offset, length=file_limit if file_limit != 0 else None)\n                        text_column_index = 0\n\n                        # Iterate over rows\n                        for row in df.iter_rows():\n                            text = row[text_column_index]\n\n                            if cast_to_py_string:\n                                text = str(text)\n\n                            yield text\n                            rows += 1\n\n                            if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                                # break row loop\n                                break\n                        else:\n                            raise ValueError(\"Invalid `reader_implementation`\")\n                else:\n                    logger.debug(\"Skip this file because output does not 
contain the requested rows: %s\", file_path)\n\n                # current_offset += file_rows_count  # TODO +1?\n                chunk_start = chunk_end + 1  # set start for the next chunk\n\n            if rows_limit &gt; 0 and rows &gt;= rows_limit:\n                # break file loop\n                logger.debug(\"break file loop\")\n                break\n    else:\n        logger.warning(\"Cannot generate texts because output files do not exist.\")\n\n    logger.info(\n        \"Texts generated: %s (expected size: %s; offset: %s; limit: %s;)\", rows, limit - offset, offset, limit\n    )\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.get_compression_from_output_files","title":"<code>get_compression_from_output_files(shuffled=False)</code>","text":"<p>NOTE: Currently only implemented for <code>parquet</code> format.</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def get_compression_from_output_files(self, shuffled=False):\n    \"\"\"NOTE: Currently only implemented for `parquet` format.\"\"\"\n    if self.output_format == \"parquet\":\n        for output_path in self.get_output_file_paths(shuffled=shuffled):\n            if os.path.exists(output_path):\n                with open(output_path, \"rb\") as f:\n                    parquet_file = pq.ParquetFile(\n                        f,\n                        # increased to avoid OSErrors\n                        thrift_string_size_limit=1000000000,  # default: 100000000\n                        thrift_container_size_limit=10000000,  # default: 1000000\n                    )\n                    parquet_metadata = parquet_file.metadata\n                    for i in range(parquet_metadata.num_row_groups):\n                        for j in range(parquet_metadata.num_columns):\n                            return parquet_file.metadata.row_group(i).column(j).compression\n\n    return 
\"unknown\"\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.get_estimated_bytes_from_output","title":"<code>get_estimated_bytes_from_output(shuffled=False, read_first_n_rows=1000)</code>","text":"<p>Estimate byte size of output text: - read first N rows of shuffled output files and count their byte size - multiply counted bytes by total number of rows</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def get_estimated_bytes_from_output(self, shuffled: bool = False, read_first_n_rows: int = 1_000) -&gt; int:\n    \"\"\"Estimate byte size of output text:\n    - read first N rows of shuffled output files and count their byte size\n    - multiply counted bytes by total number of rows\n    \"\"\"\n    if not shuffled:\n        raise NotImplementedError\n\n    if self.output_format != \"parquet\":\n        raise NotImplementedError\n\n    bytes_sum = 0\n    total_rows = 0\n\n    # iterate over output files (use shuffled files for a better estimate)\n    for output_path in self.get_output_file_paths(shuffled=shuffled):\n        if os.path.exists(output_path):\n            # read the first n rows\n            df = pl.scan_parquet(\n                output_path,\n                low_memory=True,\n                n_rows=read_first_n_rows,\n            ).collect(streaming=True)\n            for row in df.iter_rows():\n                text = str(row[0])\n                bytes_sum += len(text.encode(\"utf-8\"))  # count the byte size of the text\n\n            # read total row count from metadata\n            with open(output_path, \"rb\") as f:\n                parquet_file = pq.ParquetFile(\n                    f,\n                    # increased to avoid OSErrors\n                    thrift_string_size_limit=1000000000,  # default: 100000000\n                    thrift_container_size_limit=10000000,  # default: 1000000\n                )\n                total_rows += parquet_file.metadata.num_rows\n\n    # 
estimated bytes\n    bytes_per_row = bytes_sum / read_first_n_rows\n    total_bytes = int(total_rows * bytes_per_row)\n\n    return total_bytes\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.get_output_rows_count","title":"<code>get_output_rows_count(shuffled=False)</code>","text":"<p>Read metadata from parquet files and extract number of rows</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def get_output_rows_count(self, shuffled: bool = False) -&gt; int:\n    \"\"\"Read metadata from parquet files and extract number of rows\"\"\"\n    if self.output_format == \"parquet\":\n        output_paths = list(self.get_output_file_paths(shuffled=shuffled))\n\n        # Filter for existing\n        output_paths = [output_path for output_path in output_paths if os.path.exists(output_path)]\n\n        if output_paths:\n            rows_count = 0\n\n            for output_path in output_paths:\n                with open(output_path, \"rb\") as f:\n                    parquet_file = pq.ParquetFile(\n                        f,\n                        # increased to avoid OSErrors\n                        thrift_string_size_limit=1000000000,  # default: 100000000\n                        thrift_container_size_limit=10000000,  # default: 1000000\n                    )\n                    rows_count += parquet_file.metadata.num_rows\n\n                    logger.debug(\"Rows = %s in %s\", rows_count, output_path)\n\n            return rows_count\n\n        logger.debug(\"No output files exists for %s\", self.DATASET_ID)\n        return -1\n    else:\n        raise ValueError(f\"Cannot determine the output rows count with {self.output_format=}\")\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.get_sampling_factor","title":"<code>get_sampling_factor()</code>","text":"<p>Sampling is defined based on dataset ID, source ID, or language.</p> Source code in 
<code>src/llm_datasets/datasets/base.py</code> <pre><code>def get_sampling_factor(self) -&gt; float:\n    \"\"\"Sampling is defined based on dataset ID, source ID, or language.\"\"\"\n    if self.config:\n        if self.DATASET_ID in self.config.sampling_factor_by_dataset_id:\n            return self.config.sampling_factor_by_dataset_id[self.DATASET_ID]\n\n        if self.get_source_id() in self.config.sampling_factor_by_source_id:\n            return self.config.sampling_factor_by_source_id[self.get_source_id()]\n\n        if self.get_language_code() in self.config.sampling_factor_by_language:\n            return self.config.sampling_factor_by_language[self.get_language_code()]\n\n    return 1.0  # default factor\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.is_selected","title":"<code>is_selected()</code>","text":"<p>Is this dataset part of selected datasets or sources?</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def is_selected(self) -&gt; bool:\n    \"\"\"Is this dataset part of selected datasets or sources?\"\"\"\n    if (\n        self.DATASET_ID in self.config.selected_dataset_ids\n        or self.get_source_id() in self.config.selected_source_ids\n    ):\n        return True\n    else:\n        # try fnmatch\n        for pattern in self.config.get_selected_dataset_ids(mode=\"fnmatch\"):\n            if fnmatch.fnmatch(self.DATASET_ID, pattern):\n                return True\n\n        return False\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.save_stats","title":"<code>save_stats()</code>","text":"<p>Save the processing statistics (counter) into a JSON file in the output directory.</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def save_stats(self):\n    \"\"\"Save the processing statistics (counter) into a JSON file in the output directory.\"\"\"\n    if self.counter is None:\n        logger.error(\"Cannot save 
statistics because none were recorded.\")\n        return\n\n    date_format = \"%Y-%m-%d_%H%M%S\"\n    self.end_time = datetime.datetime.now()\n    short_uuid = str(uuid.uuid4())[:5]\n    stats_file_name = f\"stats_{self.end_time.strftime(date_format)}_{short_uuid}.{self.config.get_job_id()}.json\"\n    stats_file_path = os.path.join(self.get_output_dir(), stats_file_name)\n\n    stats = {\n        \"counter\": dict(self.counter),\n        \"start_time\": self.start_time.strftime(date_format),\n        \"end_time\": self.end_time.strftime(date_format),\n        \"job_id\": self.config.get_job_id(),\n        # \"config\": self.config,\n    }\n\n    with open(stats_file_path, \"w\") as f:\n        json.dump(stats, f, indent=4)\n\n    logger.info(f\"Statistics saved to {stats_file_path}\")\n\n    return stats_file_path\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.save_texts","title":"<code>save_texts(texts, append=False)</code>","text":"<p>Save texts in different formats</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def save_texts(self, texts: Iterable[str], append: bool = False):\n    \"\"\"Save texts in different formats\"\"\"\n    if self.has_output_files() and not self.override_output:\n        raise FileExistsError(f\"Output exists already (override not enabled): {self.get_output_file_paths()}\")\n\n    if self.output_format == \"jsonl\":\n        docs_count = self.save_texts_to_jsonl(texts, append=append)\n\n    elif self.output_format == \"parquet\":\n        if append:\n            raise NotImplementedError(\"Appending is not supported by parquet output format\")\n\n        docs_count, saved_chunks = self.save_texts_to_parquet(texts)\n\n        self.counter.update({\"saved_chunks\": saved_chunks})\n    else:\n        raise ValueError(f\"Unsupported output format: {self.output_format}\")\n\n    logger.info(f\"Documents saved: {docs_count:,}\")\n\n    self.counter.update({\"docs_count\": 
docs_count})\n\n    if docs_count == 0:\n        logger.warning(\"No documents have been saved!\")\n\n        # delete empty output file\n        if self.has_output_files():\n            self.remove_texts()\n\n    return docs_count\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.save_texts_to_jsonl","title":"<code>save_texts_to_jsonl(texts, append=False)</code>","text":"<p>Write JSONL files to /.jsonl (each line is a JSON object with \"doc\" field and text as plain text) Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def save_texts_to_jsonl(self, texts: Iterable[str], append: bool = False):\n    \"\"\"Write JSONL files to &lt;output_dir&gt;/&lt;DATASET_ID&gt;.jsonl\n    (each line is a JSON object with \"doc\" field and text as plain text)\n    \"\"\"\n    mode = \"a\" if append else \"w\"\n    fp = self.get_output_file_paths(single=True)[0]\n\n    # Save as JSONL\n    logger.info(f\"Writing JSONL output to {fp} ({mode=})\")\n\n    docs_count = 0\n\n    with smart_open(fp, mode) as f:\n        for docs_count, text in enumerate(self.filter_texts(texts), 1):\n            f.write(json.dumps({self.get_output_text_field(): text}, ensure_ascii=self.json_ensure_ascii) + \"\\n\")\n\n            if docs_count &gt; 0 and (docs_count % self.print_write_progress) == 0:\n                logger.info(f\"Written {docs_count:,} docs ...\")\n\n            if self.limit &gt; 0 and docs_count &gt;= self.limit:\n                logger.warning(f\"Limit reached ({docs_count:,} docs)\")\n\n                if hasattr(texts, \"terminate\"):\n                    logger.info(\"Killing all remaining workers, if any\")\n                    texts.terminate()\n                break\n\n    if hasattr(texts, \"terminate\"):\n        logger.info(\"Killing all remaining workers, if any (iterator end)\")\n        texts.terminate()\n\n    return 
docs_count\n</code></pre>"},{"location":"api/base_dataset/#llm_datasets.datasets.base.BaseDataset.save_texts_to_parquet","title":"<code>save_texts_to_parquet(texts, file_path=None, apply_filter=True)</code>","text":"<p>Save text in parquet (single column schema, in batches)</p> Source code in <code>src/llm_datasets/datasets/base.py</code> <pre><code>def save_texts_to_parquet(self, texts: Iterable[str], file_path: Optional[str] = None, apply_filter: bool = True):\n    \"\"\"Save text in parquet (single column schema, in batches)\"\"\"\n    assert self.output_format == \"parquet\"\n\n    if file_path is None:\n        file_path = self.get_output_file_paths(single=True)[0]\n\n    if apply_filter:\n        texts = self.filter_texts_or_documents(texts)\n\n    if self.config.use_documents:\n        # document schema\n        schema = self.get_document_schema().get_pa_schema()\n    else:\n        # text-only schema\n        schema = pa.schema(\n            [\n                (self.get_output_text_field(), pa.string()),\n            ]\n        )\n\n    # Max. 
chunk size is multiplied with this factor\n    # (to account for inaccurate chunk sizes due to batching)\n    safety_factor = 0.975\n\n    # Save as Parquet file\n    logger.info(f\"Writing parquet output ({self.output_batch_size=}; {self.limit=}; {self.output_compression=})\")\n\n    saved_docs, saved_chunks = save_texts_to_parquet_chunks(\n        texts=texts,\n        schema=schema,\n        max_chunk_uncompressed_bytes=(\n            self.max_output_chunk_uncompressed_bytes * safety_factor\n            if self.max_output_chunk_uncompressed_bytes is not None\n            else None\n        ),\n        max_chunk_rows=self.max_output_chunk_rows,\n        output_path_func=self.get_single_or_chunked_output_file_path,\n        compression=get_parquet_compression(self.output_compression),\n        batch_size=self.output_batch_size,\n        print_write_progress=self.print_write_progress,\n        limit=self.limit,\n    )\n\n    if hasattr(texts, \"terminate\"):\n        logger.info(\"Killing all remaining workers, if any (iterator end)\")\n        texts.terminate()\n\n    return saved_docs, saved_chunks\n</code></pre>"},{"location":"api/config/","title":"Config","text":"<p>               Bases: <code>object</code></p> Source code in <code>src/llm_datasets/utils/config.py</code> <pre><code>class Config(object):\n    text_datasets_dir = None\n    output_format = \"jsonl\"\n    output_compression = None\n\n    raw_datasets_dir = None\n    shuffled_datasets_dir = None\n\n    composed_dataset_dir = (\n        None  # composed dataset (train/val split) is saved into this directory\n    )\n    local_dirs_by_dataset_id = {}\n    local_dirs_by_source_id = {}\n    sampling_factor_by_dataset_id = {}\n    sampling_factor_by_source_id = {}\n    sampling_factor_by_language = {}\n\n    only_selected_datasets: bool = False\n    selected_dataset_ids: List[str] = []\n    selected_source_ids: List[str] = []\n\n    validation_ratio = 0.005  # number of documents in the split: 
len(dataset) * ratio\n    validation_min_total_docs = (\n        1_000  # to be used as validation set, the dataset must have at least n docs\n    )\n    validation_max_split_docs = (\n        1_000  # number of documents in validation split are capped at this numbers\n    )\n    validation_min_split_docs = 10  # split must have at least this number of documents, otherwise it will be discarded\n    tokenizer_train_ratio = 0.1  # % of train data used for tokenizer training\n\n    # Vocab size should divisble by 8\n    # - Jan's recommendation: 250680\n    # - NVIDIA recommendation for multilingual models: 256000\n    tokenizer_vocab_size: int = 256000\n    tokenizer_model_type: Literal[\n        \"bpe\", \"unigram\", \"word\", \"char\"\n    ] = \"bpe\"  # SP model types\n\n    seed: int = 0\n\n    extra_dataset_registries: Union[None, str, List[str]] = None\n    extra_dataset_classes: Union[None, List] = None\n    use_default_dataset_registry: bool = True\n\n    # Datasets are initialized with these kwargs\n    extra_dataset_kwargs: dict[str, dict] = {}\n\n    use_documents: bool = False\n    workers: int = 0\n    limit: int = 0\n    skip_items = 0\n    job_id = None\n    save_stats = True\n    verbose = False\n    log_file = None\n    override = False\n\n    def __init__(self, **entries):\n        self.__dict__.update(entries)\n\n    def init_logger(self, logger_name):\n        log_handlers = [logging.StreamHandler()]\n\n        if self.log_file:\n            log_handlers.append(logging.FileHandler(self.log_file))\n\n        logging.basicConfig(\n            format=\"%(asctime)s - %(levelname)s - %(name)s -   %(message)s\",\n            datefmt=\"%Y-%m-%d %H:%M:%S\",\n            level=logging.DEBUG if self.verbose else logging.INFO,\n            handlers=log_handlers,\n        )\n        logger = logging.getLogger(logger_name)\n\n        return logger\n\n    def get_extra_dataset_kwargs(self, dataset_id) -&gt; dict:\n        try:\n            return 
self.extra_dataset_kwargs[dataset_id]\n        except KeyError:\n            return {}\n\n    def get_selected_dataset_ids(\n        self, mode: Literal[\"all\", \"exact\", \"fnmatch\"] = \"all\"\n    ):\n        if mode == \"exact\":\n            # only ids for exact match\n            return [\n                s for s in self.selected_dataset_ids if \"*\" not in s and \"?\" not in s\n            ]\n        elif mode == \"fnmatch\":\n            # only ids for fnmatch\n            return [s for s in self.selected_dataset_ids if \"*\" in s or \"?\" in s]\n        else:\n            # all\n            return self.selected_dataset_ids\n\n    def get_job_id(self) -&gt; Union[None, str]:\n        \"\"\"Returns manually set job ID or from environment variable (SLURM_JOBID)\"\"\"\n        if self.job_id is None:\n            self.job_id = os.environ.get(\"SLURM_JOBID\", \"0\")\n\n        return self.job_id\n\n    def get_key_value_pairs(self, keys: Iterable) -&gt; Dict:\n        return {k: getattr(self, k) for k in keys}\n</code></pre>"},{"location":"api/config/#llm_datasets.utils.config.Config.get_job_id","title":"<code>get_job_id()</code>","text":"<p>Returns manually set job ID or from environment variable (SLURM_JOBID)</p> Source code in <code>src/llm_datasets/utils/config.py</code> <pre><code>def get_job_id(self) -&gt; Union[None, str]:\n    \"\"\"Returns manually set job ID or from environment variable (SLURM_JOBID)\"\"\"\n    if self.job_id is None:\n        self.job_id = os.environ.get(\"SLURM_JOBID\", \"0\")\n\n    return self.job_id\n</code></pre>"},{"location":"api/hf_dataset/","title":"HFDataset","text":"<p>               Bases: <code>BaseDocumentDataset</code></p> Source code in <code>src/llm_datasets/datasets/hf_dataset.py</code> <pre><code>class HFDataset(BaseDocumentDataset):\n    HF_DATASET_ID: str = None\n    HF_DATASET_SPLIT: Optional[str] = None\n    HF_DATASET_CONFIGS: Optional[List[str]] = None\n    HF_DATA_DIR = None\n    HF_KWARGS = None\n    
HF_REVISION: Optional[str] = None\n\n    config_to_dataset: Optional[Dict] = None\n    id_column_name = None\n    text_column_name = \"text\"\n    title_column_name = None\n    metadata_column_names = None\n    remove_columns = None\n    streaming = False\n    keep_columns = False\n\n    def __init__(self, **kwargs) -&gt; None:\n        super().__init__(**kwargs)\n\n        if self.HF_DATASET_ID is None:\n            raise ValueError(\"HF_DATASET_ID is not set\")\n\n    def get_hf_configs(self):\n        if self.HF_DATASET_CONFIGS:\n            return self.HF_DATASET_CONFIGS\n        else:\n            # if no config is used\n            return [None]\n\n    def download(self):\n        self.config_to_dataset = {}\n\n        for hf_config in self.get_hf_configs():\n            logger.info(f\"Downloading for {hf_config=}\")\n\n            if self.HF_KWARGS:\n                # use additional kwargs as defined by dataset class\n                ds = load_dataset(\n                    self.HF_DATASET_ID,\n                    hf_config,\n                    split=self.HF_DATASET_SPLIT,\n                    data_dir=self.HF_DATA_DIR,\n                    streaming=self.streaming,\n                    use_auth_token=self.get_hf_auth_token(),\n                    keep_in_memory=False,\n                    revision=self.HF_REVISION,\n                    **self.HF_KWARGS,\n                )\n            else:\n                ds = load_dataset(\n                    self.HF_DATASET_ID,\n                    hf_config,\n                    split=self.HF_DATASET_SPLIT,\n                    data_dir=self.HF_DATA_DIR,\n                    streaming=self.streaming,\n                    use_auth_token=self.get_hf_auth_token(),\n                    keep_in_memory=False,\n                    revision=self.HF_REVISION,\n                )\n\n            # check dataset split\n            if isinstance(ds, DatasetDict) and not self.HF_DATASET_SPLIT:\n                logger.warning(f\"HF 
returned DatasetDict but split not set: {DatasetDict}\")\n\n            if self.limit &gt; 0:\n                if self.streaming:\n                    logger.warning(f\"Limit requested ({self.limit=}) but streaming is enabled!\")\n                else:\n                    logger.warning(f\"Limiting dataset to: {self.limit}\")\n                    ds = ds.select(range(self.limit))\n\n            if self.remove_columns is not None:\n                logger.info(f\"Removing columns (at download): {self.remove_columns}\")\n\n                ds = ds.remove_columns(self.remove_columns)\n\n            filter_func = self.get_filter_func()\n            if filter_func:\n                logger.info(f\"Dataset size before filter: {len(ds):,}\")\n\n                ds = ds.filter(filter_func, num_proc=self.workers)\n\n                logger.info(f\"Dataset size after filter: {len(ds):,}\")\n\n            self.config_to_dataset[hf_config] = ds\n\n    def get_filter_func(self):\n        return None\n\n    def get_document_from_item(self, item, index: Optional[int] = None) -&gt; Document:\n        return Document(\n            text=item[self.text_column_name],\n            id=item[self.id_column_name] if self.id_column_name else index,\n            metadata={col: item[col] for col in self.metadata_column_names} if self.metadata_column_names else {},\n        )\n\n    def prepend_title(self, example):\n        example[self.text_column_name] = (\n            example[self.title_column_name] + self.title_delimiter + example[self.text_column_name]\n        )\n\n        return example\n\n    def get_documents(self) -&gt; Iterable[Document]:\n        self.download()\n        doc_idx = 0\n        # drop all non-text columns\n        for ds_idx, config in enumerate(self.config_to_dataset):\n            # remove non-text and non-title columns\n            if not self.keep_columns:\n                columns_to_remove = set(self.config_to_dataset[config].column_names) - 
{self.text_column_name}\n\n                if self.title_column_name:\n                    columns_to_remove = columns_to_remove - {self.title_column_name}\n\n                logger.info(\"Removing columns (get texts): %s\", columns_to_remove)\n\n                self.config_to_dataset[config] = self.config_to_dataset[config].remove_columns(columns_to_remove)\n\n            if self.title_column_name:\n                logger.info(f\"Prepending title to text column ({self.title_column_name=})\")\n\n                self.config_to_dataset[config] = self.config_to_dataset[config].map(self.prepend_title)\n\n                # remove title column\n                self.config_to_dataset[config] = self.config_to_dataset[config].remove_columns([self.title_column_name])\n\n            ds_iterator = iter(self.config_to_dataset[config])\n\n            for item in ds_iterator:\n                if hasattr(self, \"get_documents_from_item\"):\n                    # multiple documents from a single item\n                    yield from self.get_documents_from_item(item)\n                else:\n                    yield self.get_document_from_item(item, doc_idx)\n                    doc_idx += 1\n</code></pre>"},{"location":"api/jsonl_dataset/","title":"BaseDataset","text":"<p>               Bases: <code>JSONLMixin</code>, <code>BaseTextDataset</code></p> Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code> <pre><code>class JSONLDataset(JSONLMixin, BaseTextDataset):  # TODO rename to JSONLTextDataset\n    def get_text_from_item(self, item) -&gt; str:\n        \"\"\"This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)\"\"\"\n        return item[self.raw_jsonl_text_field]\n\n    def get_document_from_item(self, item) -&gt; Document:\n        \"\"\"This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)\"\"\"\n        return 
Document(text=item[self.raw_jsonl_text_field])\n\n    def get_texts_from_file_handler(self, file_handler):\n        if hasattr(self.config, \"use_documents\") and self.config.use_documents:\n            getter_func = self.get_document_from_item\n        else:\n            getter_func = self.get_text_from_item\n\n        for line in file_handler:\n            item = json.loads(line)\n            text = getter_func(item)\n\n            if text:\n                yield text\n\n    def get_texts_from_file_path(self, file_path: str | Path):\n        logger.info(f\"Reading from {file_path}\")\n\n        if (\n            isinstance(file_path, str) and file_path.endswith(\".zst\")\n        ) or file_path.suffix == \".zst\":  # zstd compression\n            with open(file_path, \"rb\") as zf:\n                dctx = zstd.ZstdDecompressor()  # uncompress zstd\n                with dctx.stream_reader(zf) as reader:\n                    f = io.BufferedReader(reader)\n                    yield from self.get_texts_from_file_handler(f)\n        else:\n            with open(file_path) as f:  # jsonl or jsonl.fz (via smart_open)\n                yield from self.get_texts_from_file_handler(f)\n\n    def get_texts(self):\n        \"\"\"Iterate over all input files and read JSON from each line.\"\"\"\n        # if self.workers == 1:\n        yield from self.get_texts_with_single_proc()\n        # else:\n        #     yield from self.get_texts_with_multi_proc()\n\n    def get_texts_with_multi_proc(self):\n        \"\"\"Iterate over all input files in parallel and read JSON from each line.\"\"\"\n        raise NotImplementedError()\n        # # with multiprocessing.Pool(self.workers) as pool:\n        # with multiprocess.Pool(self.workers) as pool:\n        #     for text in flatmap(pool, self.get_texts_from_file_path, self.get_raw_jsonl_paths()):\n        #         yield text\n\n        # print(\"all files done\")\n\n    def get_texts_with_single_proc(self):\n        \"\"\"Iterate over 
all input files and read JSON from each line.\"\"\"\n        processed_files = 0\n        for file_path in self.get_raw_jsonl_paths():\n            yield from self.get_texts_from_file_path(file_path)\n\n            processed_files += 1\n\n        if processed_files == 0:\n            logger.warning(\"No file has been processed.\")\n</code></pre>"},{"location":"api/jsonl_dataset/#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_document_from_item","title":"<code>get_document_from_item(item)</code>","text":"<p>This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)</p> Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code> <pre><code>def get_document_from_item(self, item) -&gt; Document:\n    \"\"\"This simply returns the document with a text field from item (but dataset classes can override this to implement filtering etc.)\"\"\"\n    return Document(text=item[self.raw_jsonl_text_field])\n</code></pre>"},{"location":"api/jsonl_dataset/#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_text_from_item","title":"<code>get_text_from_item(item)</code>","text":"<p>This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)</p> Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code> <pre><code>def get_text_from_item(self, item) -&gt; str:\n    \"\"\"This simply returns the text field from item (but dataset classes can override this to implement filtering etc.)\"\"\"\n    return item[self.raw_jsonl_text_field]\n</code></pre>"},{"location":"api/jsonl_dataset/#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts","title":"<code>get_texts()</code>","text":"<p>Iterate over all input files and read JSON from each line.</p> Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code> <pre><code>def get_texts(self):\n    \"\"\"Iterate over all input files and read JSON from each line.\"\"\"\n    # if 
self.workers == 1:\n    yield from self.get_texts_with_single_proc()\n</code></pre>"},{"location":"api/jsonl_dataset/#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_multi_proc","title":"<code>get_texts_with_multi_proc()</code>","text":"<p>Iterate over all input files in parallel and read JSON from each line.</p> Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code> <pre><code>def get_texts_with_multi_proc(self):\n    \"\"\"Iterate over all input files in parallel and read JSON from each line.\"\"\"\n    raise NotImplementedError()\n</code></pre>"},{"location":"api/jsonl_dataset/#llm_datasets.datasets.jsonl_dataset.JSONLDataset.get_texts_with_single_proc","title":"<code>get_texts_with_single_proc()</code>","text":"<p>Iterate over all input files and read JSON from each line.</p> Source code in <code>src/llm_datasets/datasets/jsonl_dataset.py</code> <pre><code>def get_texts_with_single_proc(self):\n    \"\"\"Iterate over all input files and read JSON from each line.\"\"\"\n    processed_files = 0\n    for file_path in self.get_raw_jsonl_paths():\n        yield from self.get_texts_from_file_path(file_path)\n\n        processed_files += 1\n\n    if processed_files == 0:\n        logger.warning(\"No file has been processed.\")\n</code></pre>"},{"location":"datasets/","title":"Available datasets","text":"<p># Datasets</p> <p>The framework provides 2241 datasets from 62 sources in 164 languages. 
The languages are as follows: Afrikaans, Amharic, Aragonese, Arabic, Arz, Assamese, Ast, Avaric, Azerbaijani, Azb, Bashkir, Belarusian, Bulgarian, Bihari, Bengali, Tibetan, Bpy, Breton, Bosnian, Bxr, Catalan, Chechen, Ceb, Ckb, Code, Czech, Chuvash, Welsh, Danish, German, Dsb, Dhivehi, Greek, English, Esperanto, Spanish, Estonian, Basque, Persian, Finnish, French, Western Frisian, Irish, Gaelic, Galician, Guaran\u00ed, Gom, Gsw, Gujarati, Hausa, Hebrew, Hindi, Croatian, Hsb, Haitian, Hungarian, Armenian, Interlingua, Indonesian, Interlingue, Igbo, Ilo, Ido, Icelandic, Italian, Japanese, Jbo, Javanese, Georgian, Kazakh, Khmer, Kannada, Korean, Krc, Kurdish, Komi, Cornish, Kirghiz, Latin, Luxembourgish, Lez, Limburgish, Lmo, Lao, Lithuanian, Latvian, Mai, Malagasy, Mhr, Min, Macedonian, Malayalam, Mongolian, Marathi, Mrj, Malay, Maltese, Multi, Mwl, Burmese, Mzn, Nah, Nds, Nepali, New, Dutch, Norwegian Nynorsk, Norwegian, Chichewa, Occitan, Oromo, Oriya, Ossetian, Panjabi, Polish, Pms, Pnb, Pashto, Portuguese, Quechua, Romanian, Russian, Kinyarwanda, Sanskrit, Sah, Sindhi, Serbo-Croatian, Sinhalese, Slovak, Slovene, Shona, Somali, Albanian, Serbian, Southern Sotho, Sundanese, Swedish, Swahili, Tamil, Telugu, Tajik, Thai, Tigrinya, Turkmen, Tagalog, Turkish, Tatar, Uighur, Ukrainian, Urdu, Uzbek, Vietnamese, Volap\u00fck, Walloon, War, Wuu, X-Eml, Xal, Xhosa, Xmf, Yiddish, Yoruba, Chinese, Zu</p>"},{"location":"datasets/#languages","title":"Languages","text":"language reported_tokens af N/A am N/A an N/A ar N/A arz N/A as N/A ast N/A av N/A az N/A azb N/A ba N/A be N/A bg 13 B bh N/A bn N/A bo N/A bpy N/A br N/A bs N/A bxr N/A ca 4 B ce N/A ceb N/A ckb N/A code 250 B cs 21 B cv N/A cy N/A da 11 B de 26 B dsb N/A dv N/A el 24 B en 117 B eo N/A es 20 B et 5 B eu 982 M fa N/A fi 9 B fr 60 B fy N/A ga 669 M gd N/A gl 36 M gn N/A gom N/A gsw N/A gu N/A ha N/A he N/A hi N/A hr 8 B hsb N/A ht N/A hu 12 B hy N/A ia N/A id N/A ie N/A ig N/A ilo N/A io N/A is N/A it 14 B ja N/A 
jbo N/A jv N/A ka N/A kk N/A km N/A kn N/A ko N/A krc N/A ku N/A kv N/A kw N/A ky N/A la N/A lb N/A lez N/A li N/A lmo N/A lo N/A lt 5 B lv 4 B mai N/A mg N/A mhr N/A min N/A mk N/A ml N/A mn N/A mr N/A mrj N/A ms N/A mt 4 B multi N/A mwl N/A my N/A mzn N/A nah N/A nds N/A ne N/A new N/A nl 26 B nn 301 M no 5 B ny N/A oc N/A om N/A or N/A os N/A pa N/A pl 25 B pms N/A pnb N/A ps N/A pt 24 B qu N/A ro 9 B ru N/A rw N/A sa N/A sah N/A sd N/A sh 58 k si N/A sk 18 B sl 9 B sn N/A so N/A sq N/A sr 3 B st N/A su N/A sv 13 B sw N/A ta N/A te N/A tg N/A th N/A ti N/A tk N/A tl N/A tr N/A tt N/A ug N/A uk 11 B ur N/A uz N/A vi N/A vo N/A wa N/A war N/A wuu N/A x-eml N/A xal N/A xh N/A xmf N/A yi N/A yo N/A zh N/A zu N/A"},{"location":"datasets/#data-sources","title":"Data sources","text":"source_id reported_tokens curlicat 410 M macocu 23 B redpajama 46 B wura N/A wikihow 2 M pes2o 42 B proof_pile 8 B pile_of_law N/A math_amps N/A edgarcorpus 7 B bulgarian_news 283 M bulnc 567 M openlegaldata 10 B dewac 2 B ga_bilingual_legistation 4 M ga_universal_dependencies 3 M hrwac 1 B styria_news 409 M croatian_news_engri 695 M itwac 2 B korpus_malti 366 M sonar 500 M cc_gigafida 127 M academic_slovene_kas 1 B slwac_web 1 B sk_court_decisions 11 B sk_laws 45 M syn_v9 5 B cs_en_parallel N/A danish_gigaword 1 B danewsroom 472 M dk_clarin 441 M cabernet 712 M norwegian_cc 5 B pl_nkjp 1 M pl_parliamentary_corpus 671 M parlamento_pt 819 M brwac 3 B seimas_lt_en 48 k state_related_latvian_web 1 M greek_legal_code 45 M greek_web_corpus 3 B estonian_reference_corpus 175 M enc2021 N/A ekspress N/A euscrawl 846 M spanish_legal 3 B ylenews N/A sv_gigaword 1 B srpkor N/A marcell_legislative_subcorpus_v2 31 M uk_laws 579 M eurlex 121 B legal_mc4 29 B wiki 12 B wikibooks 353 M wikiquote 268 M wikinews 79 M wikisource 2 B wikivoyage 132 M colossal_oscar 154 B starcoder 250 B <p>This page is automatically generated.</p>"},{"location":"datasets/language_af/","title":"Afrikaans 
Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Afrikaans language.</p>"},{"location":"datasets/language_af/#colossal-oscar-1-af-2015-14","title":"Colossal OSCAR 1 [af; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_af</code> Title: Colossal OSCAR 1 [af; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2016-40","title":"Colossal OSCAR 1 [af; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_af</code> Title: Colossal OSCAR 1 [af; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2017-43","title":"Colossal OSCAR 1 [af; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_af</code> Title: Colossal OSCAR 1 [af; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2018-47","title":"Colossal OSCAR 1 [af; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_af</code> Title: Colossal OSCAR 1 [af; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2019-22","title":"Colossal OSCAR 1 [af; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_af</code> Title: Colossal OSCAR 1 [af; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2020-24","title":"Colossal OSCAR 1 [af; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_af</code> Title: Colossal OSCAR 1 [af; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2020-45","title":"Colossal OSCAR 1 [af; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_af</code> Title: Colossal OSCAR 1 [af; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2021-49","title":"Colossal OSCAR 1 [af; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_af</code> Title: Colossal OSCAR 1 [af; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2022-27","title":"Colossal OSCAR 1 [af; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_af</code> Title: Colossal OSCAR 1 [af; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2022-49","title":"Colossal OSCAR 1 [af; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_af</code> Title: Colossal OSCAR 1 [af; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2023-14","title":"Colossal OSCAR 1 [af; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_af</code> Title: Colossal OSCAR 1 [af; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#colossal-oscar-1-af-2023-23","title":"Colossal OSCAR 1 [af; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_af</code> Title: Colossal OSCAR 1 [af; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_af/#wura-afrikaans","title":"WURA [Afrikaans]","text":"Dataset ID: <code>wura_af</code> Title: WURA [Afrikaans] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_am/","title":"Amharic Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Amharic language.</p>"},{"location":"datasets/language_am/#colossal-oscar-1-am-2015-14","title":"Colossal OSCAR 1 [am; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_am</code> Title: Colossal OSCAR 1 [am; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2016-40","title":"Colossal OSCAR 1 [am; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_am</code> Title: Colossal OSCAR 1 [am; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2017-43","title":"Colossal OSCAR 1 [am; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_am</code> Title: Colossal OSCAR 1 [am; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2018-47","title":"Colossal OSCAR 1 [am; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_am</code> Title: Colossal OSCAR 1 [am; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2019-22","title":"Colossal OSCAR 1 [am; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_am</code> Title: Colossal OSCAR 1 [am; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2020-24","title":"Colossal OSCAR 1 [am; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_am</code> Title: Colossal OSCAR 1 [am; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2020-45","title":"Colossal OSCAR 1 [am; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_am</code> Title: Colossal OSCAR 1 [am; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2021-49","title":"Colossal OSCAR 1 [am; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_am</code> Title: Colossal OSCAR 1 [am; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2022-27","title":"Colossal OSCAR 1 [am; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_am</code> Title: Colossal OSCAR 1 [am; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2022-49","title":"Colossal OSCAR 1 [am; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_am</code> Title: Colossal OSCAR 1 [am; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2023-14","title":"Colossal OSCAR 1 [am; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_am</code> Title: Colossal OSCAR 1 [am; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#colossal-oscar-1-am-2023-23","title":"Colossal OSCAR 1 [am; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_am</code> Title: Colossal OSCAR 1 [am; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_am/#wura-amharic","title":"WURA [Amharic]","text":"Dataset ID: <code>wura_am</code> Title: WURA [Amharic] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_an/","title":"Aragonese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Aragonese language.</p>"},{"location":"datasets/language_an/#colossal-oscar-1-an-2015-14","title":"Colossal OSCAR 1 [an; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_an</code> Title: Colossal OSCAR 1 [an; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2016-40","title":"Colossal OSCAR 1 [an; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_an</code> Title: Colossal OSCAR 1 [an; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2017-43","title":"Colossal OSCAR 1 [an; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_an</code> Title: Colossal OSCAR 1 [an; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2018-47","title":"Colossal OSCAR 1 [an; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_an</code> Title: Colossal OSCAR 1 [an; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2019-22","title":"Colossal OSCAR 1 [an; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_an</code> Title: Colossal OSCAR 1 [an; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2020-24","title":"Colossal OSCAR 1 [an; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_an</code> Title: Colossal OSCAR 1 [an; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2020-45","title":"Colossal OSCAR 1 [an; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_an</code> Title: Colossal OSCAR 1 [an; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2021-49","title":"Colossal OSCAR 1 [an; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_an</code> Title: Colossal OSCAR 1 [an; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2022-27","title":"Colossal OSCAR 1 [an; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_an</code> Title: Colossal OSCAR 1 [an; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2022-49","title":"Colossal OSCAR 1 [an; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_an</code> Title: Colossal OSCAR 1 [an; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2023-14","title":"Colossal OSCAR 1 [an; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_an</code> Title: Colossal OSCAR 1 [an; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_an/#colossal-oscar-1-an-2023-23","title":"Colossal OSCAR 1 [an; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_an</code> Title: Colossal OSCAR 1 [an; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ar/","title":"Arabic Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Arabic language.</p>"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2015-14","title":"Colossal OSCAR 1 [ar; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ar</code> Title: Colossal OSCAR 1 [ar; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2016-40","title":"Colossal OSCAR 1 [ar; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ar</code> Title: Colossal OSCAR 1 [ar; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2017-43","title":"Colossal OSCAR 1 [ar; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ar</code> Title: Colossal OSCAR 1 [ar; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2018-47","title":"Colossal OSCAR 1 [ar; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ar</code> Title: Colossal OSCAR 1 [ar; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2019-22","title":"Colossal OSCAR 1 [ar; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ar</code> Title: Colossal OSCAR 1 [ar; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2020-24","title":"Colossal OSCAR 1 [ar; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ar</code> Title: Colossal OSCAR 1 [ar; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2020-45","title":"Colossal OSCAR 1 [ar; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ar</code> Title: Colossal OSCAR 1 [ar; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2021-49","title":"Colossal OSCAR 1 [ar; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ar</code> Title: Colossal OSCAR 1 [ar; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2022-27","title":"Colossal OSCAR 1 [ar; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ar</code> Title: Colossal OSCAR 1 [ar; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2022-49","title":"Colossal OSCAR 1 [ar; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ar</code> Title: Colossal OSCAR 1 [ar; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2023-14","title":"Colossal OSCAR 1 [ar; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ar</code> Title: Colossal OSCAR 1 [ar; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#colossal-oscar-1-ar-2023-23","title":"Colossal OSCAR 1 [ar; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ar</code> Title: Colossal OSCAR 1 [ar; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ar/#wura-egyptian-arabic","title":"WURA [Egyptian Arabic]","text":"Dataset ID: <code>wura_arz</code> Title: WURA [Egyptian Arabic] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_arz/","title":"Arz Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Arz language.</p>"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2015-14","title":"Colossal OSCAR 1 [arz; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_arz</code> Title: Colossal OSCAR 1 [arz; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2016-40","title":"Colossal OSCAR 1 [arz; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_arz</code> Title: Colossal OSCAR 1 [arz; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2017-43","title":"Colossal OSCAR 1 [arz; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_arz</code> Title: Colossal OSCAR 1 [arz; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2018-47","title":"Colossal OSCAR 1 [arz; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_arz</code> Title: Colossal OSCAR 1 [arz; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2019-22","title":"Colossal OSCAR 1 [arz; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_arz</code> Title: Colossal OSCAR 1 [arz; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2020-24","title":"Colossal OSCAR 1 [arz; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_arz</code> Title: Colossal OSCAR 1 [arz; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2020-45","title":"Colossal OSCAR 1 [arz; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_arz</code> Title: Colossal OSCAR 1 [arz; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2021-49","title":"Colossal OSCAR 1 [arz; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_arz</code> Title: Colossal OSCAR 1 [arz; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2022-27","title":"Colossal OSCAR 1 [arz; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_arz</code> Title: Colossal OSCAR 1 [arz; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2022-49","title":"Colossal OSCAR 1 [arz; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_arz</code> Title: Colossal OSCAR 1 [arz; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2023-14","title":"Colossal OSCAR 1 [arz; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_arz</code> Title: Colossal OSCAR 1 [arz; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_arz/#colossal-oscar-1-arz-2023-23","title":"Colossal OSCAR 1 [arz; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_arz</code> Title: Colossal OSCAR 1 [arz; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_as/","title":"Assamese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Assamese language.</p>"},{"location":"datasets/language_as/#colossal-oscar-1-as-2015-14","title":"Colossal OSCAR 1 [as; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_as</code> Title: Colossal OSCAR 1 [as; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2016-40","title":"Colossal OSCAR 1 [as; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_as</code> Title: Colossal OSCAR 1 [as; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2017-43","title":"Colossal OSCAR 1 [as; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_as</code> Title: Colossal OSCAR 1 [as; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2018-47","title":"Colossal OSCAR 1 [as; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_as</code> Title: Colossal OSCAR 1 [as; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2019-22","title":"Colossal OSCAR 1 [as; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_as</code> Title: Colossal OSCAR 1 [as; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2020-24","title":"Colossal OSCAR 1 [as; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_as</code> Title: Colossal OSCAR 1 [as; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2020-45","title":"Colossal OSCAR 1 [as; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_as</code> Title: Colossal OSCAR 1 [as; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2021-49","title":"Colossal OSCAR 1 [as; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_as</code> Title: Colossal OSCAR 1 [as; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2022-27","title":"Colossal OSCAR 1 [as; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_as</code> Title: Colossal OSCAR 1 [as; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2022-49","title":"Colossal OSCAR 1 [as; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_as</code> Title: Colossal OSCAR 1 [as; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2023-14","title":"Colossal OSCAR 1 [as; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_as</code> Title: Colossal OSCAR 1 [as; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_as/#colossal-oscar-1-as-2023-23","title":"Colossal OSCAR 1 [as; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_as</code> Title: Colossal OSCAR 1 [as; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ast/","title":"Ast Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Ast language.</p>"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2015-14","title":"Colossal OSCAR 1 [ast; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ast</code> Title: Colossal OSCAR 1 [ast; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2016-40","title":"Colossal OSCAR 1 [ast; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ast</code> Title: Colossal OSCAR 1 [ast; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2017-43","title":"Colossal OSCAR 1 [ast; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ast</code> Title: Colossal OSCAR 1 [ast; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2018-47","title":"Colossal OSCAR 1 [ast; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ast</code> Title: Colossal OSCAR 1 [ast; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2019-22","title":"Colossal OSCAR 1 [ast; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ast</code> Title: Colossal OSCAR 1 [ast; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2020-24","title":"Colossal OSCAR 1 [ast; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ast</code> Title: Colossal OSCAR 1 [ast; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2020-45","title":"Colossal OSCAR 1 [ast; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ast</code> Title: Colossal OSCAR 1 [ast; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2021-49","title":"Colossal OSCAR 1 [ast; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ast</code> Title: Colossal OSCAR 1 [ast; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2022-27","title":"Colossal OSCAR 1 [ast; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ast</code> Title: Colossal OSCAR 1 [ast; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2022-49","title":"Colossal OSCAR 1 [ast; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ast</code> Title: Colossal OSCAR 1 [ast; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2023-14","title":"Colossal OSCAR 1 [ast; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ast</code> Title: Colossal OSCAR 1 [ast; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ast/#colossal-oscar-1-ast-2023-23","title":"Colossal OSCAR 1 [ast; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ast</code> Title: Colossal OSCAR 1 [ast; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_av/","title":"Avaric Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Avaric language.</p>"},{"location":"datasets/language_av/#colossal-oscar-1-av-2015-14","title":"Colossal OSCAR 1 [av; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_av</code> Title: Colossal OSCAR 1 [av; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2016-40","title":"Colossal OSCAR 1 [av; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_av</code> Title: Colossal OSCAR 1 [av; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2017-43","title":"Colossal OSCAR 1 [av; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_av</code> Title: Colossal OSCAR 1 [av; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2018-47","title":"Colossal OSCAR 1 [av; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_av</code> Title: Colossal OSCAR 1 [av; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2019-22","title":"Colossal OSCAR 1 [av; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_av</code> Title: Colossal OSCAR 1 [av; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2020-24","title":"Colossal OSCAR 1 [av; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_av</code> Title: Colossal OSCAR 1 [av; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2020-45","title":"Colossal OSCAR 1 [av; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_av</code> Title: Colossal OSCAR 1 [av; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2021-49","title":"Colossal OSCAR 1 [av; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_av</code> Title: Colossal OSCAR 1 [av; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2022-27","title":"Colossal OSCAR 1 [av; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_av</code> Title: Colossal OSCAR 1 [av; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2022-49","title":"Colossal OSCAR 1 [av; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_av</code> Title: Colossal OSCAR 1 [av; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2023-14","title":"Colossal OSCAR 1 [av; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_av</code> Title: Colossal OSCAR 1 [av; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_av/#colossal-oscar-1-av-2023-23","title":"Colossal OSCAR 1 [av; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_av</code> Title: Colossal OSCAR 1 [av; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_az/","title":"Azerbaijani Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Azerbaijani language.</p>"},{"location":"datasets/language_az/#colossal-oscar-1-az-2015-14","title":"Colossal OSCAR 1 [az; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_az</code> Title: Colossal OSCAR 1 [az; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2016-40","title":"Colossal OSCAR 1 [az; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_az</code> Title: Colossal OSCAR 1 [az; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2017-43","title":"Colossal OSCAR 1 [az; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_az</code> Title: Colossal OSCAR 1 [az; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2018-47","title":"Colossal OSCAR 1 [az; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_az</code> Title: Colossal OSCAR 1 [az; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2019-22","title":"Colossal OSCAR 1 [az; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_az</code> Title: Colossal OSCAR 1 [az; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2020-24","title":"Colossal OSCAR 1 [az; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_az</code> Title: Colossal OSCAR 1 [az; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2020-45","title":"Colossal OSCAR 1 [az; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_az</code> Title: Colossal OSCAR 1 [az; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2021-49","title":"Colossal OSCAR 1 [az; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_az</code> Title: Colossal OSCAR 1 [az; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2022-27","title":"Colossal OSCAR 1 [az; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_az</code> Title: Colossal OSCAR 1 [az; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2022-49","title":"Colossal OSCAR 1 [az; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_az</code> Title: Colossal OSCAR 1 [az; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2023-14","title":"Colossal OSCAR 1 [az; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_az</code> Title: Colossal OSCAR 1 [az; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_az/#colossal-oscar-1-az-2023-23","title":"Colossal OSCAR 1 [az; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_az</code> Title: Colossal OSCAR 1 [az; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_azb/","title":"Azb Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Azb language.</p>"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2015-14","title":"Colossal OSCAR 1 [azb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_azb</code> Title: Colossal OSCAR 1 [azb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2016-40","title":"Colossal OSCAR 1 [azb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_azb</code> Title: Colossal OSCAR 1 [azb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2017-43","title":"Colossal OSCAR 1 [azb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_azb</code> Title: Colossal OSCAR 1 [azb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2018-47","title":"Colossal OSCAR 1 [azb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_azb</code> Title: Colossal OSCAR 1 [azb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2019-22","title":"Colossal OSCAR 1 [azb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_azb</code> Title: Colossal OSCAR 1 [azb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2020-24","title":"Colossal OSCAR 1 [azb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_azb</code> Title: Colossal OSCAR 1 [azb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2020-45","title":"Colossal OSCAR 1 [azb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_azb</code> Title: Colossal OSCAR 1 [azb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2021-49","title":"Colossal OSCAR 1 [azb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_azb</code> Title: Colossal OSCAR 1 [azb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2022-27","title":"Colossal OSCAR 1 [azb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_azb</code> Title: Colossal OSCAR 1 [azb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2022-49","title":"Colossal OSCAR 1 [azb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_azb</code> Title: Colossal OSCAR 1 [azb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2023-14","title":"Colossal OSCAR 1 [azb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_azb</code> Title: Colossal OSCAR 1 [azb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_azb/#colossal-oscar-1-azb-2023-23","title":"Colossal OSCAR 1 [azb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_azb</code> Title: Colossal OSCAR 1 [azb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ba/","title":"Bashkir Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Bashkir language.</p>"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2015-14","title":"Colossal OSCAR 1 [ba; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ba</code> Title: Colossal OSCAR 1 [ba; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2016-40","title":"Colossal OSCAR 1 [ba; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ba</code> Title: Colossal OSCAR 1 [ba; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2017-43","title":"Colossal OSCAR 1 [ba; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ba</code> Title: Colossal OSCAR 1 [ba; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2018-47","title":"Colossal OSCAR 1 [ba; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ba</code> Title: Colossal OSCAR 1 [ba; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2019-22","title":"Colossal OSCAR 1 [ba; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ba</code> Title: Colossal OSCAR 1 [ba; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2020-24","title":"Colossal OSCAR 1 [ba; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ba</code> Title: Colossal OSCAR 1 [ba; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2020-45","title":"Colossal OSCAR 1 [ba; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ba</code> Title: Colossal OSCAR 1 [ba; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2021-49","title":"Colossal OSCAR 1 [ba; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ba</code> Title: Colossal OSCAR 1 [ba; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2022-27","title":"Colossal OSCAR 1 [ba; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ba</code> Title: Colossal OSCAR 1 [ba; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2022-49","title":"Colossal OSCAR 1 [ba; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ba</code> Title: Colossal OSCAR 1 [ba; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2023-14","title":"Colossal OSCAR 1 [ba; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ba</code> Title: Colossal OSCAR 1 [ba; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ba/#colossal-oscar-1-ba-2023-23","title":"Colossal OSCAR 1 [ba; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ba</code> Title: Colossal OSCAR 1 [ba; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_be/","title":"Belarusian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Belarusian language.</p>"},{"location":"datasets/language_be/#colossal-oscar-1-be-2015-14","title":"Colossal OSCAR 1 [be; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_be</code> Title: Colossal OSCAR 1 [be; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2016-40","title":"Colossal OSCAR 1 [be; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_be</code> Title: Colossal OSCAR 1 [be; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2017-43","title":"Colossal OSCAR 1 [be; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_be</code> Title: Colossal OSCAR 1 [be; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2018-47","title":"Colossal OSCAR 1 [be; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_be</code> Title: Colossal OSCAR 1 [be; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2019-22","title":"Colossal OSCAR 1 [be; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_be</code> Title: Colossal OSCAR 1 [be; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2020-24","title":"Colossal OSCAR 1 [be; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_be</code> Title: Colossal OSCAR 1 [be; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2020-45","title":"Colossal OSCAR 1 [be; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_be</code> Title: Colossal OSCAR 1 [be; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2021-49","title":"Colossal OSCAR 1 [be; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_be</code> Title: Colossal OSCAR 1 [be; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2022-27","title":"Colossal OSCAR 1 [be; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_be</code> Title: Colossal OSCAR 1 [be; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2022-49","title":"Colossal OSCAR 1 [be; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_be</code> Title: Colossal OSCAR 1 [be; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2023-14","title":"Colossal OSCAR 1 [be; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_be</code> Title: Colossal OSCAR 1 [be; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_be/#colossal-oscar-1-be-2023-23","title":"Colossal OSCAR 1 [be; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_be</code> Title: Colossal OSCAR 1 [be; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_bg/","title":"Bulgarian Datasets","text":"<p>There are in total 23 datasets with 13 B tokens in Bulgarian language.</p>"},{"location":"datasets/language_bg/#bulgarian-national-corpus","title":"Bulgarian National Corpus","text":"Dataset ID: <code>bulnc</code> Title: Bulgarian National Corpus Description: The Bulgarian National Corpus contains a wide range of texts in various sizes, media types (written and spoken), styles, periods (synchronic and diachronic), and licenses. Each text in the collection is supplied with metadata. The Bulgarian National Corpus  was first compiled using the Bulgarian Lexicographic Archive and the Text Archive of Written Bulgarian, which account for 55.95% of the corpus. 
Later, the EMEA corpus (medical administrative texts) and the OpenSubtitles corpus (film subtitles) were added, accounting for 1.27% and 8.61% of the BulNC, respectively. The remaining texts were crawled automatically and include a large number of administrative texts, news from monolingual and multilingual sources, scientific texts, and popular science. The BulNC is not fully downloadable due to the inclusion of copyrighted material. We've provided a link to a password-protected archive for evaluation. Availibility: <code>on_request</code> Homepage: [None] License: research only (commercial use: None, sharealike: False) Tokens: 567 M"},{"location":"datasets/language_bg/#curlicat-corpus-bulgarian","title":"CURLICAT Corpus [Bulgarian]","text":"Dataset ID: <code>curlicat_bg</code> Title: CURLICAT Corpus [Bulgarian] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-bulgarian-corpus/fed6af2a590311ed9c1a00155d0267062ed273d01d2343f1b78d08d4d481679d/] License: CC-BY-SA-4.0 (commercial use: None, sharealike: True) Tokens: 35 M"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2015-14","title":"Colossal OSCAR 1 [bg; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bg</code> Title: Colossal OSCAR 1 [bg; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2016-40","title":"Colossal OSCAR 1 [bg; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bg</code> Title: Colossal OSCAR 1 [bg; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2017-43","title":"Colossal OSCAR 1 [bg; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bg</code> Title: Colossal OSCAR 1 [bg; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2018-47","title":"Colossal OSCAR 1 [bg; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bg</code> Title: Colossal OSCAR 1 [bg; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2019-22","title":"Colossal OSCAR 1 [bg; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bg</code> Title: Colossal OSCAR 1 [bg; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2020-24","title":"Colossal OSCAR 1 [bg; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bg</code> Title: Colossal OSCAR 1 [bg; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2020-45","title":"Colossal OSCAR 1 [bg; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bg</code> Title: Colossal OSCAR 1 [bg; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2021-49","title":"Colossal OSCAR 1 [bg; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bg</code> Title: Colossal OSCAR 1 [bg; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2022-27","title":"Colossal OSCAR 1 [bg; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bg</code> Title: Colossal OSCAR 1 [bg; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2022-49","title":"Colossal OSCAR 1 [bg; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bg</code> Title: Colossal OSCAR 1 [bg; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2023-14","title":"Colossal OSCAR 1 [bg; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bg</code> Title: Colossal OSCAR 1 [bg; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bg/#colossal-oscar-1-bg-2023-23","title":"Colossal OSCAR 1 [bg; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bg</code> Title: Colossal OSCAR 1 [bg; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 4 B"},{"location":"datasets/language_bg/#crawl-of-bulgarian-news-websites","title":"Crawl of Bulgarian news websites","text":"Dataset ID: <code>bulgarian_news</code> Title: Crawl of Bulgarian news websites Description: The collection was collected by crawling Bulgarian websites in Bulgarian. Text samples are in json format. We can provide raw tests. Availibility: <code>on_request</code> Homepage: [None] License: research only (commercial use: None, sharealike: None) Tokens: 283 M"},{"location":"datasets/language_bg/#eurlexresources-bg","title":"EurlexResources [bg]","text":"Dataset ID: <code>eurlex_bg</code> Title: EurlexResources [bg] Description: A Corpus Covering the Largest EURLEX Resources. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_bg/#legalmc4-bg","title":"LegalMC4 [bg]","text":"Dataset ID: <code>legal_mc4_bg</code> Title: LegalMC4 [bg] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 2 M"},{"location":"datasets/language_bg/#macocu-web-corpus-bulgarian-20","title":"MaCoCu web corpus [Bulgarian 2.0]","text":"Dataset ID: <code>macocu_bg</code> Title: MaCoCu web corpus [Bulgarian 2.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1800] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_bg/#wikibooks-bg","title":"Wikibooks [bg]","text":"Dataset ID: <code>wikibooks_bg</code> Title: Wikibooks [bg] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M"},{"location":"datasets/language_bg/#wikinews-bg","title":"Wikinews [bg]","text":"Dataset ID: <code>wikinews_bg</code> Title: Wikinews [bg] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_bg/#wikipedia-bg","title":"Wikipedia [bg]","text":"Dataset ID: <code>wiki_bg</code> Title: Wikipedia [bg] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 356 M"},{"location":"datasets/language_bg/#wikiquote-bg","title":"Wikiquote [bg]","text":"Dataset ID: <code>wikiquote_bg</code> Title: Wikiquote [bg] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 11 M"},{"location":"datasets/language_bg/#wikisource-bg","title":"Wikisource [bg]","text":"Dataset ID: <code>wikisource_bg</code> Title: Wikisource [bg] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 25 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_bh/","title":"Bihari Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Bihari language.</p>"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2015-14","title":"Colossal OSCAR 1 [bh; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bh</code> Title: Colossal OSCAR 1 [bh; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2016-40","title":"Colossal OSCAR 1 [bh; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bh</code> Title: Colossal OSCAR 1 [bh; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2017-43","title":"Colossal OSCAR 1 [bh; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bh</code> Title: Colossal OSCAR 1 [bh; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2018-47","title":"Colossal OSCAR 1 [bh; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bh</code> Title: Colossal OSCAR 1 [bh; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2019-22","title":"Colossal OSCAR 1 [bh; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bh</code> Title: Colossal OSCAR 1 [bh; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2020-24","title":"Colossal OSCAR 1 [bh; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bh</code> Title: Colossal OSCAR 1 [bh; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2020-45","title":"Colossal OSCAR 1 [bh; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bh</code> Title: Colossal OSCAR 1 [bh; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2021-49","title":"Colossal OSCAR 1 [bh; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bh</code> Title: Colossal OSCAR 1 [bh; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2022-27","title":"Colossal OSCAR 1 [bh; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bh</code> Title: Colossal OSCAR 1 [bh; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2022-49","title":"Colossal OSCAR 1 [bh; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bh</code> Title: Colossal OSCAR 1 [bh; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2023-14","title":"Colossal OSCAR 1 [bh; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bh</code> Title: Colossal OSCAR 1 [bh; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bh/#colossal-oscar-1-bh-2023-23","title":"Colossal OSCAR 1 [bh; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bh</code> Title: Colossal OSCAR 1 [bh; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_bn/","title":"Bengali Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Bengali language.</p>"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2015-14","title":"Colossal OSCAR 1 [bn; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bn</code> Title: Colossal OSCAR 1 [bn; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2016-40","title":"Colossal OSCAR 1 [bn; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bn</code> Title: Colossal OSCAR 1 [bn; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2017-43","title":"Colossal OSCAR 1 [bn; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bn</code> Title: Colossal OSCAR 1 [bn; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2018-47","title":"Colossal OSCAR 1 [bn; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bn</code> Title: Colossal OSCAR 1 [bn; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2019-22","title":"Colossal OSCAR 1 [bn; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bn</code> Title: Colossal OSCAR 1 [bn; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2020-24","title":"Colossal OSCAR 1 [bn; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bn</code> Title: Colossal OSCAR 1 [bn; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2020-45","title":"Colossal OSCAR 1 [bn; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bn</code> Title: Colossal OSCAR 1 [bn; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2021-49","title":"Colossal OSCAR 1 [bn; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bn</code> Title: Colossal OSCAR 1 [bn; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2022-27","title":"Colossal OSCAR 1 [bn; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bn</code> Title: Colossal OSCAR 1 [bn; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2022-49","title":"Colossal OSCAR 1 [bn; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bn</code> Title: Colossal OSCAR 1 [bn; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2023-14","title":"Colossal OSCAR 1 [bn; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bn</code> Title: Colossal OSCAR 1 [bn; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bn/#colossal-oscar-1-bn-2023-23","title":"Colossal OSCAR 1 [bn; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bn</code> Title: Colossal OSCAR 1 [bn; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_bo/","title":"Tibetan Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Tibetan language.</p>"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2015-14","title":"Colossal OSCAR 1 [bo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bo</code> Title: Colossal OSCAR 1 [bo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2016-40","title":"Colossal OSCAR 1 [bo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bo</code> Title: Colossal OSCAR 1 [bo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2017-43","title":"Colossal OSCAR 1 [bo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bo</code> Title: Colossal OSCAR 1 [bo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2018-47","title":"Colossal OSCAR 1 [bo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bo</code> Title: Colossal OSCAR 1 [bo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2019-22","title":"Colossal OSCAR 1 [bo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bo</code> Title: Colossal OSCAR 1 [bo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2020-24","title":"Colossal OSCAR 1 [bo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bo</code> Title: Colossal OSCAR 1 [bo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2020-45","title":"Colossal OSCAR 1 [bo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bo</code> Title: Colossal OSCAR 1 [bo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2021-49","title":"Colossal OSCAR 1 [bo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bo</code> Title: Colossal OSCAR 1 [bo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2022-27","title":"Colossal OSCAR 1 [bo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bo</code> Title: Colossal OSCAR 1 [bo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2022-49","title":"Colossal OSCAR 1 [bo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bo</code> Title: Colossal OSCAR 1 [bo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2023-14","title":"Colossal OSCAR 1 [bo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bo</code> Title: Colossal OSCAR 1 [bo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bo/#colossal-oscar-1-bo-2023-23","title":"Colossal OSCAR 1 [bo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bo</code> Title: Colossal OSCAR 1 [bo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_bpy/","title":"Bpy Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Bpy language.</p>"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2015-14","title":"Colossal OSCAR 1 [bpy; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bpy</code> Title: Colossal OSCAR 1 [bpy; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2016-40","title":"Colossal OSCAR 1 [bpy; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bpy</code> Title: Colossal OSCAR 1 [bpy; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2017-43","title":"Colossal OSCAR 1 [bpy; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bpy</code> Title: Colossal OSCAR 1 [bpy; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2018-47","title":"Colossal OSCAR 1 [bpy; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bpy</code> Title: Colossal OSCAR 1 [bpy; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2019-22","title":"Colossal OSCAR 1 [bpy; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bpy</code> Title: Colossal OSCAR 1 [bpy; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2020-24","title":"Colossal OSCAR 1 [bpy; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bpy</code> Title: Colossal OSCAR 1 [bpy; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2020-45","title":"Colossal OSCAR 1 [bpy; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bpy</code> Title: Colossal OSCAR 1 [bpy; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2021-49","title":"Colossal OSCAR 1 [bpy; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bpy</code> Title: Colossal OSCAR 1 [bpy; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2022-27","title":"Colossal OSCAR 1 [bpy; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bpy</code> Title: Colossal OSCAR 1 [bpy; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2022-49","title":"Colossal OSCAR 1 [bpy; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bpy</code> Title: Colossal OSCAR 1 [bpy; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2023-14","title":"Colossal OSCAR 1 [bpy; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bpy</code> Title: Colossal OSCAR 1 [bpy; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bpy/#colossal-oscar-1-bpy-2023-23","title":"Colossal OSCAR 1 [bpy; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bpy</code> Title: Colossal OSCAR 1 [bpy; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_br/","title":"Breton Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Breton language.</p>"},{"location":"datasets/language_br/#colossal-oscar-1-br-2015-14","title":"Colossal OSCAR 1 [br; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_br</code> Title: Colossal OSCAR 1 [br; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2016-40","title":"Colossal OSCAR 1 [br; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_br</code> Title: Colossal OSCAR 1 [br; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2017-43","title":"Colossal OSCAR 1 [br; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_br</code> Title: Colossal OSCAR 1 [br; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2018-47","title":"Colossal OSCAR 1 [br; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_br</code> Title: Colossal OSCAR 1 [br; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2019-22","title":"Colossal OSCAR 1 [br; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_br</code> Title: Colossal OSCAR 1 [br; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2020-24","title":"Colossal OSCAR 1 [br; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_br</code> Title: Colossal OSCAR 1 [br; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2020-45","title":"Colossal OSCAR 1 [br; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_br</code> Title: Colossal OSCAR 1 [br; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2021-49","title":"Colossal OSCAR 1 [br; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_br</code> Title: Colossal OSCAR 1 [br; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2022-27","title":"Colossal OSCAR 1 [br; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_br</code> Title: Colossal OSCAR 1 [br; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2022-49","title":"Colossal OSCAR 1 [br; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_br</code> Title: Colossal OSCAR 1 [br; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2023-14","title":"Colossal OSCAR 1 [br; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_br</code> Title: Colossal OSCAR 1 [br; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_br/#colossal-oscar-1-br-2023-23","title":"Colossal OSCAR 1 [br; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_br</code> Title: Colossal OSCAR 1 [br; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_bs/","title":"Bosnian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Bosnian language.</p>"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2015-14","title":"Colossal OSCAR 1 [bs; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bs</code> Title: Colossal OSCAR 1 [bs; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2016-40","title":"Colossal OSCAR 1 [bs; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bs</code> Title: Colossal OSCAR 1 [bs; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2017-43","title":"Colossal OSCAR 1 [bs; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bs</code> Title: Colossal OSCAR 1 [bs; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2018-47","title":"Colossal OSCAR 1 [bs; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bs</code> Title: Colossal OSCAR 1 [bs; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2019-22","title":"Colossal OSCAR 1 [bs; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bs</code> Title: Colossal OSCAR 1 [bs; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2020-24","title":"Colossal OSCAR 1 [bs; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bs</code> Title: Colossal OSCAR 1 [bs; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2020-45","title":"Colossal OSCAR 1 [bs; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bs</code> Title: Colossal OSCAR 1 [bs; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2021-49","title":"Colossal OSCAR 1 [bs; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bs</code> Title: Colossal OSCAR 1 [bs; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2022-27","title":"Colossal OSCAR 1 [bs; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bs</code> Title: Colossal OSCAR 1 [bs; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2022-49","title":"Colossal OSCAR 1 [bs; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bs</code> Title: Colossal OSCAR 1 [bs; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2023-14","title":"Colossal OSCAR 1 [bs; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bs</code> Title: Colossal OSCAR 1 [bs; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bs/#colossal-oscar-1-bs-2023-23","title":"Colossal OSCAR 1 [bs; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bs</code> Title: Colossal OSCAR 1 [bs; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_bxr/","title":"Bxr Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Bxr language.</p>"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2015-14","title":"Colossal OSCAR 1 [bxr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_bxr</code> Title: Colossal OSCAR 1 [bxr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2016-40","title":"Colossal OSCAR 1 [bxr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_bxr</code> Title: Colossal OSCAR 1 [bxr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2017-43","title":"Colossal OSCAR 1 [bxr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_bxr</code> Title: Colossal OSCAR 1 [bxr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2018-47","title":"Colossal OSCAR 1 [bxr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_bxr</code> Title: Colossal OSCAR 1 [bxr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2019-22","title":"Colossal OSCAR 1 [bxr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_bxr</code> Title: Colossal OSCAR 1 [bxr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2020-24","title":"Colossal OSCAR 1 [bxr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_bxr</code> Title: Colossal OSCAR 1 [bxr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2020-45","title":"Colossal OSCAR 1 [bxr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_bxr</code> Title: Colossal OSCAR 1 [bxr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2021-49","title":"Colossal OSCAR 1 [bxr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_bxr</code> Title: Colossal OSCAR 1 [bxr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2022-27","title":"Colossal OSCAR 1 [bxr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_bxr</code> Title: Colossal OSCAR 1 [bxr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2022-49","title":"Colossal OSCAR 1 [bxr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_bxr</code> Title: Colossal OSCAR 1 [bxr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2023-14","title":"Colossal OSCAR 1 [bxr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_bxr</code> Title: Colossal OSCAR 1 [bxr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_bxr/#colossal-oscar-1-bxr-2023-23","title":"Colossal OSCAR 1 [bxr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_bxr</code> Title: Colossal OSCAR 1 [bxr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ca/","title":"Catalan Datasets","text":"<p>There are in total 19 datasets with 4 B tokens in Catalan language.</p>"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2015-14","title":"Colossal OSCAR 1 [ca; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ca</code> Title: Colossal OSCAR 1 [ca; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2016-40","title":"Colossal OSCAR 1 [ca; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ca</code> Title: Colossal OSCAR 1 [ca; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2017-43","title":"Colossal OSCAR 1 [ca; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ca</code> Title: Colossal OSCAR 1 [ca; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2018-47","title":"Colossal OSCAR 1 [ca; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ca</code> Title: Colossal OSCAR 1 [ca; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2019-22","title":"Colossal OSCAR 1 [ca; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ca</code> Title: Colossal OSCAR 1 [ca; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2020-24","title":"Colossal OSCAR 1 [ca; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ca</code> Title: Colossal OSCAR 1 [ca; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2020-45","title":"Colossal OSCAR 1 [ca; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ca</code> Title: Colossal OSCAR 1 [ca; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2021-49","title":"Colossal OSCAR 1 [ca; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ca</code> Title: Colossal OSCAR 1 [ca; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2022-27","title":"Colossal OSCAR 1 [ca; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ca</code> Title: Colossal OSCAR 1 [ca; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2022-49","title":"Colossal OSCAR 1 [ca; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ca</code> Title: Colossal OSCAR 1 [ca; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2023-14","title":"Colossal OSCAR 1 [ca; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ca</code> Title: Colossal OSCAR 1 [ca; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#colossal-oscar-1-ca-2023-23","title":"Colossal OSCAR 1 [ca; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ca</code> Title: Colossal OSCAR 1 [ca; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_ca/#macocu-web-corpus-catalan-10","title":"MaCoCu web corpus [Catalan 1.0]","text":"Dataset ID: <code>macocu_ca</code> Title: MaCoCu web corpus [Catalan 1.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1837] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 2 B"},{"location":"datasets/language_ca/#wikibooks-ca","title":"Wikibooks [ca]","text":"Dataset ID: <code>wikibooks_ca</code> Title: Wikibooks [ca] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#wikinews-ca","title":"Wikinews [ca]","text":"Dataset ID: <code>wikinews_ca</code> Title: Wikinews [ca] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#wikipedia-ca","title":"Wikipedia [ca]","text":"Dataset ID: <code>wiki_ca</code> Title: Wikipedia [ca] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#wikiquote-ca","title":"Wikiquote [ca]","text":"Dataset ID: <code>wikiquote_ca</code> Title: Wikiquote [ca] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#wikisource-ca","title":"Wikisource [ca]","text":"Dataset ID: <code>wikisource_ca</code> Title: Wikisource [ca] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ca/#wikivoyage-ca","title":"Wikivoyage [ca]","text":"Dataset ID: <code>wikivoyage_ca</code> Title: Wikivoyage [ca] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ce/","title":"Chechen Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Chechen language.</p>"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2015-14","title":"Colossal OSCAR 1 [ce; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ce</code> Title: Colossal OSCAR 1 [ce; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2016-40","title":"Colossal OSCAR 1 [ce; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ce</code> Title: Colossal OSCAR 1 [ce; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2017-43","title":"Colossal OSCAR 1 [ce; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ce</code> Title: Colossal OSCAR 1 [ce; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2018-47","title":"Colossal OSCAR 1 [ce; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ce</code> Title: Colossal OSCAR 1 [ce; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2019-22","title":"Colossal OSCAR 1 [ce; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ce</code> Title: Colossal OSCAR 1 [ce; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2020-24","title":"Colossal OSCAR 1 [ce; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ce</code> Title: Colossal OSCAR 1 [ce; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2020-45","title":"Colossal OSCAR 1 [ce; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ce</code> Title: Colossal OSCAR 1 [ce; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2021-49","title":"Colossal OSCAR 1 [ce; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ce</code> Title: Colossal OSCAR 1 [ce; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2022-27","title":"Colossal OSCAR 1 [ce; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ce</code> Title: Colossal OSCAR 1 [ce; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2022-49","title":"Colossal OSCAR 1 [ce; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ce</code> Title: Colossal OSCAR 1 [ce; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2023-14","title":"Colossal OSCAR 1 [ce; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ce</code> Title: Colossal OSCAR 1 [ce; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ce/#colossal-oscar-1-ce-2023-23","title":"Colossal OSCAR 1 [ce; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ce</code> Title: Colossal OSCAR 1 [ce; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ceb/","title":"Ceb Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Ceb language.</p>"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2015-14","title":"Colossal OSCAR 1 [ceb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ceb</code> Title: Colossal OSCAR 1 [ceb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2016-40","title":"Colossal OSCAR 1 [ceb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ceb</code> Title: Colossal OSCAR 1 [ceb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2017-43","title":"Colossal OSCAR 1 [ceb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ceb</code> Title: Colossal OSCAR 1 [ceb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2018-47","title":"Colossal OSCAR 1 [ceb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ceb</code> Title: Colossal OSCAR 1 [ceb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2019-22","title":"Colossal OSCAR 1 [ceb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ceb</code> Title: Colossal OSCAR 1 [ceb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2020-24","title":"Colossal OSCAR 1 [ceb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ceb</code> Title: Colossal OSCAR 1 [ceb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2020-45","title":"Colossal OSCAR 1 [ceb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ceb</code> Title: Colossal OSCAR 1 [ceb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2021-49","title":"Colossal OSCAR 1 [ceb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ceb</code> Title: Colossal OSCAR 1 [ceb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2022-27","title":"Colossal OSCAR 1 [ceb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ceb</code> Title: Colossal OSCAR 1 [ceb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2022-49","title":"Colossal OSCAR 1 [ceb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ceb</code> Title: Colossal OSCAR 1 [ceb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2023-14","title":"Colossal OSCAR 1 [ceb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ceb</code> Title: Colossal OSCAR 1 [ceb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ceb/#colossal-oscar-1-ceb-2023-23","title":"Colossal OSCAR 1 [ceb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ceb</code> Title: Colossal OSCAR 1 [ceb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ckb/","title":"Ckb Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Ckb language.</p>"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2015-14","title":"Colossal OSCAR 1 [ckb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ckb</code> Title: Colossal OSCAR 1 [ckb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2016-40","title":"Colossal OSCAR 1 [ckb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ckb</code> Title: Colossal OSCAR 1 [ckb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2017-43","title":"Colossal OSCAR 1 [ckb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ckb</code> Title: Colossal OSCAR 1 [ckb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2018-47","title":"Colossal OSCAR 1 [ckb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ckb</code> Title: Colossal OSCAR 1 [ckb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2019-22","title":"Colossal OSCAR 1 [ckb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ckb</code> Title: Colossal OSCAR 1 [ckb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2020-24","title":"Colossal OSCAR 1 [ckb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ckb</code> Title: Colossal OSCAR 1 [ckb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2020-45","title":"Colossal OSCAR 1 [ckb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ckb</code> Title: Colossal OSCAR 1 [ckb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2021-49","title":"Colossal OSCAR 1 [ckb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ckb</code> Title: Colossal OSCAR 1 [ckb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2022-27","title":"Colossal OSCAR 1 [ckb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ckb</code> Title: Colossal OSCAR 1 [ckb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2022-49","title":"Colossal OSCAR 1 [ckb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ckb</code> Title: Colossal OSCAR 1 [ckb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2023-14","title":"Colossal OSCAR 1 [ckb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ckb</code> Title: Colossal OSCAR 1 [ckb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ckb/#colossal-oscar-1-ckb-2023-23","title":"Colossal OSCAR 1 [ckb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ckb</code> Title: Colossal OSCAR 1 [ckb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_code/","title":"Code Datasets","text":"<p>There are in total 92 datasets with 250 B tokens in Code language.</p>"},{"location":"datasets/language_code/#starcoder","title":"Starcoder","text":"Dataset ID: <code>starcoder_emacs-lisp</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_1","title":"Starcoder","text":"Dataset ID: <code>starcoder_visual-basic</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_2","title":"Starcoder","text":"Dataset ID: <code>starcoder_racket</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_3","title":"Starcoder","text":"Dataset ID: <code>starcoder_json</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_4","title":"Starcoder","text":"Dataset ID: <code>starcoder_common-lisp</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_5","title":"Starcoder","text":"Dataset ID: <code>starcoder_vhdl</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_6","title":"Starcoder","text":"Dataset ID: <code>starcoder_r</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_7","title":"Starcoder","text":"Dataset ID: <code>starcoder_javascript</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_8","title":"Starcoder","text":"Dataset ID: <code>starcoder_coffeescript</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_9","title":"Starcoder","text":"Dataset ID: <code>starcoder_verilog</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_10","title":"Starcoder","text":"Dataset ID: <code>starcoder_python</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_11","title":"Starcoder","text":"Dataset ID: <code>starcoder_java-server-pages</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_12","title":"Starcoder","text":"Dataset ID: <code>starcoder_cmake</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_13","title":"Starcoder","text":"Dataset ID: <code>starcoder_typescript</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_14","title":"Starcoder","text":"Dataset ID: <code>starcoder_protocol-buffer</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_15","title":"Starcoder","text":"Dataset ID: <code>starcoder_java</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_16","title":"Starcoder","text":"Dataset ID: <code>starcoder_clojure</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_17","title":"Starcoder","text":"Dataset ID: <code>starcoder_thrift</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_18","title":"Starcoder","text":"Dataset ID: <code>starcoder_prolog</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_19","title":"Starcoder","text":"Dataset ID: <code>starcoder_isabelle</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_20","title":"Starcoder","text":"Dataset ID: <code>starcoder_cpp</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_21","title":"Starcoder","text":"Dataset ID: <code>starcoder_c-sharp</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_22","title":"Starcoder","text":"Dataset ID: <code>starcoder_julia</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_23","title":"Starcoder","text":"Dataset ID: <code>starcoder_xslt</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_24","title":"Starcoder","text":"Dataset ID: <code>starcoder_elm</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_25","title":"Starcoder","text":"Dataset ID: <code>starcoder_scala</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_26","title":"Starcoder","text":"Dataset ID: <code>starcoder_literate-agda</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_27","title":"Starcoder","text":"Dataset ID: <code>starcoder_elixir</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_28","title":"Starcoder","text":"Dataset ID: <code>starcoder_sas</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_29","title":"Starcoder","text":"Dataset ID: <code>starcoder_lean</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_30","title":"Starcoder","text":"Dataset ID: <code>starcoder_dockerfile</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_31","title":"Starcoder","text":"Dataset ID: <code>starcoder_zig</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_32","title":"Starcoder","text":"Dataset ID: <code>starcoder_rust</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_33","title":"Starcoder","text":"Dataset ID: <code>starcoder_kotlin</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_34","title":"Starcoder","text":"Dataset ID: <code>starcoder_dart</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_35","title":"Starcoder","text":"Dataset ID: <code>starcoder_yaml</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_36","title":"Starcoder","text":"Dataset ID: <code>starcoder_ruby</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_37","title":"Starcoder","text":"Dataset ID: <code>starcoder_jupyter-structured-clean-dedup</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_38","title":"Starcoder","text":"Dataset ID: <code>starcoder_cuda</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_39","title":"Starcoder","text":"Dataset ID: <code>starcoder_yacc</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_40","title":"Starcoder","text":"Dataset ID: <code>starcoder_rmarkdown</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_41","title":"Starcoder","text":"Dataset ID: <code>starcoder_jupyter-scripts-dedup-filtered</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_42","title":"Starcoder","text":"Dataset ID: <code>starcoder_css</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_43","title":"Starcoder","text":"Dataset ID: <code>starcoder_restructuredtext</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_44","title":"Starcoder","text":"Dataset ID: <code>starcoder_tex</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_45","title":"Starcoder","text":"Dataset ID: <code>starcoder_powershell</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_46","title":"Starcoder","text":"Dataset ID: <code>starcoder_idris</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_47","title":"Starcoder","text":"Dataset ID: <code>starcoder_applescript</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_48","title":"Starcoder","text":"Dataset ID: <code>starcoder_sql</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_49","title":"Starcoder","text":"Dataset ID: <code>starcoder_markdown</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_50","title":"Starcoder","text":"Dataset ID: <code>starcoder_git-commits-cleaned</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_51","title":"Starcoder","text":"Dataset ID: <code>starcoder_antlr</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_52","title":"Starcoder","text":"Dataset ID: <code>starcoder_sparql</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_53","title":"Starcoder","text":"Dataset ID: <code>starcoder_maple</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_54","title":"Starcoder","text":"Dataset ID: <code>starcoder_fortran</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_55","title":"Starcoder","text":"Dataset ID: <code>starcoder_alloy</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_56","title":"Starcoder","text":"Dataset ID: <code>starcoder_solidity</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_57","title":"Starcoder","text":"Dataset ID: <code>starcoder_makefile</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_58","title":"Starcoder","text":"Dataset ID: <code>starcoder_f-sharp</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_59","title":"Starcoder","text":"Dataset ID: <code>starcoder_agda</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_60","title":"Starcoder","text":"Dataset ID: <code>starcoder_smalltalk</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_61","title":"Starcoder","text":"Dataset ID: <code>starcoder_lua</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_62","title":"Starcoder","text":"Dataset ID: <code>starcoder_erlang</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_63","title":"Starcoder","text":"Dataset ID: <code>starcoder_ada</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_64","title":"Starcoder","text":"Dataset ID: <code>starcoder_shell</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_65","title":"Starcoder","text":"Dataset ID: <code>starcoder_literate-haskell</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_66","title":"Starcoder","text":"Dataset ID: <code>starcoder_github-issues-filtered-structured</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_67","title":"Starcoder","text":"Dataset ID: <code>starcoder_mathematica</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_68","title":"Starcoder","text":"Dataset ID: <code>starcoder_stan</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_69","title":"Starcoder","text":"Dataset ID: <code>starcoder_assembly</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_70","title":"Starcoder","text":"Dataset ID: <code>starcoder_c</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_71","title":"Starcoder","text":"Dataset ID: <code>starcoder_tcsh</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_72","title":"Starcoder","text":"Dataset ID: <code>starcoder_php</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_73","title":"Starcoder","text":"Dataset ID: <code>starcoder_html</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_74","title":"Starcoder","text":"Dataset ID: <code>starcoder_bluespec</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_75","title":"Starcoder","text":"Dataset ID: <code>starcoder_tcl</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_76","title":"Starcoder","text":"Dataset ID: <code>starcoder_perl</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_77","title":"Starcoder","text":"Dataset ID: <code>starcoder_haskell</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_78","title":"Starcoder","text":"Dataset ID: <code>starcoder_batchfile</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_79","title":"Starcoder","text":"Dataset ID: <code>starcoder_literate-coffeescript</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_80","title":"Starcoder","text":"Dataset ID: <code>starcoder_systemverilog</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_81","title":"Starcoder","text":"Dataset ID: <code>starcoder_groovy</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_82","title":"Starcoder","text":"Dataset ID: <code>starcoder_awk</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_83","title":"Starcoder","text":"Dataset ID: <code>starcoder_stata</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_84","title":"Starcoder","text":"Dataset ID: <code>starcoder_ocaml</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_85","title":"Starcoder","text":"Dataset ID: <code>starcoder_go</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_86","title":"Starcoder","text":"Dataset ID: <code>starcoder_augeas</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_87","title":"Starcoder","text":"Dataset ID: <code>starcoder_standard-ml</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_88","title":"Starcoder","text":"Dataset ID: <code>starcoder_matlab</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_89","title":"Starcoder","text":"Dataset ID: <code>starcoder_glsl</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. 
It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_90","title":"Starcoder","text":"Dataset ID: <code>starcoder_pascal</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_code/#starcoder_91","title":"Starcoder","text":"Dataset ID: <code>starcoder_scheme</code> Title: Starcoder Description: The dataset used for training StarCoder and StarCoderBase. It contains 783GB of code in 86 programming languages, and includes 54GB GitHub Issues + 13GB Jupyter notebooks in scripts and text-code pairs, and 32GB of GitHub commits, which is approximately 250 Billion tokens. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/bigcode/starcoderdata] License: mixed permissive liceses (commercial use: True, sharealike: False) Tokens: 3 B <p>This page is automatically generated.</p>"},{"location":"datasets/language_cs/","title":"Czech Datasets","text":"<p>There are in total 21 datasets with 21 B tokens in Czech language.</p>"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2015-14","title":"Colossal OSCAR 1 [cs; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_cs</code> Title: Colossal OSCAR 1 [cs; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2016-40","title":"Colossal OSCAR 1 [cs; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_cs</code> Title: Colossal OSCAR 1 [cs; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2017-43","title":"Colossal OSCAR 1 [cs; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_cs</code> Title: Colossal OSCAR 1 [cs; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2018-47","title":"Colossal OSCAR 1 [cs; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_cs</code> Title: Colossal OSCAR 1 [cs; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2019-22","title":"Colossal OSCAR 1 [cs; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_cs</code> Title: Colossal OSCAR 1 [cs; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2020-24","title":"Colossal OSCAR 1 [cs; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_cs</code> Title: Colossal OSCAR 1 [cs; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2020-45","title":"Colossal OSCAR 1 [cs; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_cs</code> Title: Colossal OSCAR 1 [cs; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2021-49","title":"Colossal OSCAR 1 [cs; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_cs</code> Title: Colossal OSCAR 1 [cs; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2022-27","title":"Colossal OSCAR 1 [cs; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_cs</code> Title: Colossal OSCAR 1 [cs; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2022-49","title":"Colossal OSCAR 1 [cs; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_cs</code> Title: Colossal OSCAR 1 [cs; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2023-14","title":"Colossal OSCAR 1 [cs; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_cs</code> Title: Colossal OSCAR 1 [cs; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cs/#colossal-oscar-1-cs-2023-23","title":"Colossal OSCAR 1 [cs; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_cs</code> Title: Colossal OSCAR 1 [cs; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 10 B"},{"location":"datasets/language_cs/#czech-english-parallel-corpus-10-czeng-10","title":"Czech-English Parallel Corpus 1.0 (CzEng 1.0)","text":"Dataset ID: <code>cs_en_parallel</code> Title: Czech-English Parallel Corpus 1.0 (CzEng 1.0) Description: CzEng 1.0 is the fourth release of a sentence-parallel Czech-English corpus compiled at the Institute of Formal and Applied Linguistics (\u00daFAL) freely available for non-commercial research purposes. CzEng 1.0 contains 15 million parallel sentences (233 million English and 206 million Czech tokens) from seven different types of sources automatically annotated at surface and deep (a- and t-) layers of syntactic representation. Availibility: <code>direct_download</code> Homepage: [http://hdl.handle.net/11234/1-1458] License: Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) (commercial use: False, sharealike: True) Tokens: N/A"},{"location":"datasets/language_cs/#eurlexresources-cs","title":"EurlexResources [cs]","text":"Dataset ID: <code>eurlex_cs</code> Title: EurlexResources [cs] Description: A Corpus Covering the Largest EURLEX Resources. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 5 B"},{"location":"datasets/language_cs/#legalmc4-cs","title":"LegalMC4 [cs]","text":"Dataset ID: <code>legal_mc4_cs</code> Title: LegalMC4 [cs] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_cs/#syn-v9-large-corpus-of-written-czech","title":"SYN v9: large corpus of written Czech","text":"Dataset ID: <code>syn_v9</code> Title: SYN v9: large corpus of written Czech Description: Corpus of contemporary written (printed) Czech sized 4.7 GW (i.e. 5.7 billion tokens). It covers mostly the 1990-2019 period and features rich metadata including detailed bibliographical information, text-type classification etc. SYN v9 contains a wide variety of text types (fiction, non-fiction, newspapers), but the newspapers prevail noticeably. Availibility: <code>signin_download</code> Homepage: [https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4635] License: Academic Use - Czech National Corpus (Shuffled Corpus Data) (commercial use: False, sharealike: None) Tokens: 5 B"},{"location":"datasets/language_cs/#wikibooks-cs","title":"Wikibooks [cs]","text":"Dataset ID: <code>wikibooks_cs</code> Title: Wikibooks [cs] Description: The open-content textbooks collection that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 4 M"},{"location":"datasets/language_cs/#wikinews-cs","title":"Wikinews [cs]","text":"Dataset ID: <code>wikinews_cs</code> Title: Wikinews [cs] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 2 M"},{"location":"datasets/language_cs/#wikipedia-cs","title":"Wikipedia [cs]","text":"Dataset ID: <code>wiki_cs</code> Title: Wikipedia [cs] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 273 M"},{"location":"datasets/language_cs/#wikiquote-cs","title":"Wikiquote [cs]","text":"Dataset ID: <code>wikiquote_cs</code> Title: Wikiquote [cs] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 2 M"},{"location":"datasets/language_cs/#wikisource-cs","title":"Wikisource [cs]","text":"Dataset ID: <code>wikisource_cs</code> Title: Wikisource [cs] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 76 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_cv/","title":"Chuvash Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Chuvash language.</p>"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2015-14","title":"Colossal OSCAR 1 [cv; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_cv</code> Title: Colossal OSCAR 1 [cv; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2016-40","title":"Colossal OSCAR 1 [cv; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_cv</code> Title: Colossal OSCAR 1 [cv; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2017-43","title":"Colossal OSCAR 1 [cv; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_cv</code> Title: Colossal OSCAR 1 [cv; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2018-47","title":"Colossal OSCAR 1 [cv; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_cv</code> Title: Colossal OSCAR 1 [cv; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2019-22","title":"Colossal OSCAR 1 [cv; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_cv</code> Title: Colossal OSCAR 1 [cv; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2020-24","title":"Colossal OSCAR 1 [cv; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_cv</code> Title: Colossal OSCAR 1 [cv; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2020-45","title":"Colossal OSCAR 1 [cv; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_cv</code> Title: Colossal OSCAR 1 [cv; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2021-49","title":"Colossal OSCAR 1 [cv; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_cv</code> Title: Colossal OSCAR 1 [cv; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2022-27","title":"Colossal OSCAR 1 [cv; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_cv</code> Title: Colossal OSCAR 1 [cv; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2022-49","title":"Colossal OSCAR 1 [cv; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_cv</code> Title: Colossal OSCAR 1 [cv; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2023-14","title":"Colossal OSCAR 1 [cv; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_cv</code> Title: Colossal OSCAR 1 [cv; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cv/#colossal-oscar-1-cv-2023-23","title":"Colossal OSCAR 1 [cv; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_cv</code> Title: Colossal OSCAR 1 [cv; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_cy/","title":"Welsh Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Welsh language.</p>"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2015-14","title":"Colossal OSCAR 1 [cy; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_cy</code> Title: Colossal OSCAR 1 [cy; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2016-40","title":"Colossal OSCAR 1 [cy; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_cy</code> Title: Colossal OSCAR 1 [cy; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2017-43","title":"Colossal OSCAR 1 [cy; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_cy</code> Title: Colossal OSCAR 1 [cy; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2018-47","title":"Colossal OSCAR 1 [cy; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_cy</code> Title: Colossal OSCAR 1 [cy; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2019-22","title":"Colossal OSCAR 1 [cy; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_cy</code> Title: Colossal OSCAR 1 [cy; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2020-24","title":"Colossal OSCAR 1 [cy; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_cy</code> Title: Colossal OSCAR 1 [cy; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2020-45","title":"Colossal OSCAR 1 [cy; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_cy</code> Title: Colossal OSCAR 1 [cy; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2021-49","title":"Colossal OSCAR 1 [cy; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_cy</code> Title: Colossal OSCAR 1 [cy; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2022-27","title":"Colossal OSCAR 1 [cy; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_cy</code> Title: Colossal OSCAR 1 [cy; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2022-49","title":"Colossal OSCAR 1 [cy; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_cy</code> Title: Colossal OSCAR 1 [cy; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2023-14","title":"Colossal OSCAR 1 [cy; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_cy</code> Title: Colossal OSCAR 1 [cy; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_cy/#colossal-oscar-1-cy-2023-23","title":"Colossal OSCAR 1 [cy; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_cy</code> Title: Colossal OSCAR 1 [cy; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_da/","title":"Danish Datasets","text":"<p>There are in total 21 datasets with 11 B tokens in Danish language.</p>"},{"location":"datasets/language_da/#colossal-oscar-1-da-2015-14","title":"Colossal OSCAR 1 [da; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_da</code> Title: Colossal OSCAR 1 [da; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2016-40","title":"Colossal OSCAR 1 [da; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_da</code> Title: Colossal OSCAR 1 [da; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2017-43","title":"Colossal OSCAR 1 [da; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_da</code> Title: Colossal OSCAR 1 [da; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2018-47","title":"Colossal OSCAR 1 [da; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_da</code> Title: Colossal OSCAR 1 [da; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2019-22","title":"Colossal OSCAR 1 [da; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_da</code> Title: Colossal OSCAR 1 [da; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2020-24","title":"Colossal OSCAR 1 [da; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_da</code> Title: Colossal OSCAR 1 [da; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2020-45","title":"Colossal OSCAR 1 [da; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_da</code> Title: Colossal OSCAR 1 [da; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2021-49","title":"Colossal OSCAR 1 [da; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_da</code> Title: Colossal OSCAR 1 [da; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2022-27","title":"Colossal OSCAR 1 [da; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_da</code> Title: Colossal OSCAR 1 [da; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2022-49","title":"Colossal OSCAR 1 [da; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_da</code> Title: Colossal OSCAR 1 [da; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2023-14","title":"Colossal OSCAR 1 [da; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_da</code> Title: Colossal OSCAR 1 [da; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_da/#colossal-oscar-1-da-2023-23","title":"Colossal OSCAR 1 [da; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_da</code> Title: Colossal OSCAR 1 [da; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_da/#dk-clarin-reference-corpus-of-general-danish","title":"DK-CLARIN Reference Corpus of General Danish","text":"Dataset ID: <code>dk_clarin</code> Title: DK-CLARIN Reference Corpus of General Danish Description: Reference Corpus of General Danish Availibility: <code>signin_download</code> Homepage: [https://korpus.dsl.dk/clarin/] License: Academic Use; CLARIN-ACA-NC (commercial use: False, sharealike: False) Tokens: 441 M"},{"location":"datasets/language_da/#danewsroom","title":"DaNewsroom","text":"Dataset ID: <code>danewsroom</code> Title: DaNewsroom Description: A Large-scale Danish Summarisation Dataset Availibility: <code>on_request</code> Homepage: [https://github.com/danielvarab/da-newsroom] License: research-only (unknown license) (commercial use: False, sharealike: None) Tokens: 472 M"},{"location":"datasets/language_da/#danish-gigaword","title":"Danish GigaWord","text":"Dataset ID: <code>danish_gigaword</code> Title: Danish GigaWord Description: A billion-word corpus of Danish text. Split into many sections, and covering many dimensions of variation (spoken/written, formal/informal, modern/old, rigsdansk/dialect, and so on).The license is CC-BY 4.0, Creative Commons with Attribution. Owners: ITU; Leon Derczynski, Manuel R. Ciosici Availibility: <code>direct_download</code> Homepage: [https://sprogteknologi.dk/dataset/danish-gigaword] License: CC-BY 4.0, Creative Commons with Attribution (commercial use: True, sharealike: False) Tokens: 1 B"},{"location":"datasets/language_da/#eurlexresources-da","title":"EurlexResources [da]","text":"Dataset ID: <code>eurlex_da</code> Title: EurlexResources [da] Description: A Corpus Covering the Largest EURLEX Resources. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 7 B"},{"location":"datasets/language_da/#legalmc4-da","title":"LegalMC4 [da]","text":"Dataset ID: <code>legal_mc4_da</code> Title: LegalMC4 [da] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 10 M"},{"location":"datasets/language_da/#wikibooks-da","title":"Wikibooks [da]","text":"Dataset ID: <code>wikibooks_da</code> Title: Wikibooks [da] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 6 M"},{"location":"datasets/language_da/#wikipedia-da","title":"Wikipedia [da]","text":"Dataset ID: <code>wiki_da</code> Title: Wikipedia [da] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 66 M"},{"location":"datasets/language_da/#wikiquote-da","title":"Wikiquote [da]","text":"Dataset ID: <code>wikiquote_da</code> Title: Wikiquote [da] Description: The free quote compendium that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 303 k"},{"location":"datasets/language_da/#wikisource-da","title":"Wikisource [da]","text":"Dataset ID: <code>wikisource_da</code> Title: Wikisource [da] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 6 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_de/","title":"German Datasets","text":"<p>There are in total 22 datasets with 26 B tokens in German language.</p>"},{"location":"datasets/language_de/#colossal-oscar-1-de-2015-14","title":"Colossal OSCAR 1 [de; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_de</code> Title: Colossal OSCAR 1 [de; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2016-40","title":"Colossal OSCAR 1 [de; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_de</code> Title: Colossal OSCAR 1 [de; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2017-43","title":"Colossal OSCAR 1 [de; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_de</code> Title: Colossal OSCAR 1 [de; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2018-47","title":"Colossal OSCAR 1 [de; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_de</code> Title: Colossal OSCAR 1 [de; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2019-22","title":"Colossal OSCAR 1 [de; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_de</code> Title: Colossal OSCAR 1 [de; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2020-24","title":"Colossal OSCAR 1 [de; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_de</code> Title: Colossal OSCAR 1 [de; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2020-45","title":"Colossal OSCAR 1 [de; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_de</code> Title: Colossal OSCAR 1 [de; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2021-49","title":"Colossal OSCAR 1 [de; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_de</code> Title: Colossal OSCAR 1 [de; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2022-27","title":"Colossal OSCAR 1 [de; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_de</code> Title: Colossal OSCAR 1 [de; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2022-49","title":"Colossal OSCAR 1 [de; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_de</code> Title: Colossal OSCAR 1 [de; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2023-14","title":"Colossal OSCAR 1 [de; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_de</code> Title: Colossal OSCAR 1 [de; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#colossal-oscar-1-de-2023-23","title":"Colossal OSCAR 1 [de; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_de</code> Title: Colossal OSCAR 1 [de; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_de/#dewac","title":"DeWaC","text":"Dataset ID: <code>dewac</code> Title: DeWaC Description: DeWaC is a 1.7 billion word corpus constructed from the Web limiting the crawl to the .de domain and using medium-frequency words from the SudDeutsche Zeitung corpus and basic German vocabulary lists as seeds. Availibility: <code>on_request</code> Homepage: [https://docs.sslmit.unibo.it/doku.php?id=corpora:dewac] License: unknown license; likely fair-use / research-only (commercial use: None, sharealike: False) Tokens: 2 B"},{"location":"datasets/language_de/#eurlexresources-de","title":"EurlexResources [de]","text":"Dataset ID: <code>eurlex_de</code> Title: EurlexResources [de] Description: A Corpus Covering the Largest EURLEX Resources. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 7 B"},{"location":"datasets/language_de/#legalmc4-de","title":"LegalMC4 [de]","text":"Dataset ID: <code>legal_mc4_de</code> Title: LegalMC4 [de] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 6 B"},{"location":"datasets/language_de/#open-legal-data-german-court-decisions-and-laws","title":"Open Legal Data - German court decisions and laws","text":"Dataset ID: <code>openlegaldata</code> Title: Open Legal Data - German court decisions and laws Description: OPENLEGALDATA.IO is a free and open platform that makes legal documents and information accessible to the public. Availibility: <code>on_request</code> Homepage: [https://openlegaldata.io/] License: public domain (commercial use: True, sharealike: False) Tokens: 10 B"},{"location":"datasets/language_de/#wikibooks-de","title":"Wikibooks [de]","text":"Dataset ID: <code>wikibooks_de</code> Title: Wikibooks [de] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 50 M"},{"location":"datasets/language_de/#wikinews-de","title":"Wikinews [de]","text":"Dataset ID: <code>wikinews_de</code> Title: Wikinews [de] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 9 M"},{"location":"datasets/language_de/#wikipedia-de","title":"Wikipedia [de]","text":"Dataset ID: <code>wiki_de</code> Title: Wikipedia [de] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_de/#wikiquote-de","title":"Wikiquote [de]","text":"Dataset ID: <code>wikiquote_de</code> Title: Wikiquote [de] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 4 M"},{"location":"datasets/language_de/#wikisource-de","title":"Wikisource [de]","text":"Dataset ID: <code>wikisource_de</code> Title: Wikisource [de] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 156 M"},{"location":"datasets/language_de/#wikivoyage-de","title":"Wikivoyage [de]","text":"Dataset ID: <code>wikivoyage_de</code> Title: Wikivoyage [de] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 29 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_dsb/","title":"Dsb Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Dsb language.</p>"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2015-14","title":"Colossal OSCAR 1 [dsb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_dsb</code> Title: Colossal OSCAR 1 [dsb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2016-40","title":"Colossal OSCAR 1 [dsb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_dsb</code> Title: Colossal OSCAR 1 [dsb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2017-43","title":"Colossal OSCAR 1 [dsb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_dsb</code> Title: Colossal OSCAR 1 [dsb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2018-47","title":"Colossal OSCAR 1 [dsb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_dsb</code> Title: Colossal OSCAR 1 [dsb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2019-22","title":"Colossal OSCAR 1 [dsb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_dsb</code> Title: Colossal OSCAR 1 [dsb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2020-24","title":"Colossal OSCAR 1 [dsb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_dsb</code> Title: Colossal OSCAR 1 [dsb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2020-45","title":"Colossal OSCAR 1 [dsb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_dsb</code> Title: Colossal OSCAR 1 [dsb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2021-49","title":"Colossal OSCAR 1 [dsb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_dsb</code> Title: Colossal OSCAR 1 [dsb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2022-27","title":"Colossal OSCAR 1 [dsb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_dsb</code> Title: Colossal OSCAR 1 [dsb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2022-49","title":"Colossal OSCAR 1 [dsb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_dsb</code> Title: Colossal OSCAR 1 [dsb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2023-14","title":"Colossal OSCAR 1 [dsb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_dsb</code> Title: Colossal OSCAR 1 [dsb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dsb/#colossal-oscar-1-dsb-2023-23","title":"Colossal OSCAR 1 [dsb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_dsb</code> Title: Colossal OSCAR 1 [dsb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_dv/","title":"Dhivehi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Dhivehi language.</p>"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2015-14","title":"Colossal OSCAR 1 [dv; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_dv</code> Title: Colossal OSCAR 1 [dv; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2016-40","title":"Colossal OSCAR 1 [dv; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_dv</code> Title: Colossal OSCAR 1 [dv; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2017-43","title":"Colossal OSCAR 1 [dv; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_dv</code> Title: Colossal OSCAR 1 [dv; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2018-47","title":"Colossal OSCAR 1 [dv; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_dv</code> Title: Colossal OSCAR 1 [dv; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2019-22","title":"Colossal OSCAR 1 [dv; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_dv</code> Title: Colossal OSCAR 1 [dv; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2020-24","title":"Colossal OSCAR 1 [dv; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_dv</code> Title: Colossal OSCAR 1 [dv; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2020-45","title":"Colossal OSCAR 1 [dv; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_dv</code> Title: Colossal OSCAR 1 [dv; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2021-49","title":"Colossal OSCAR 1 [dv; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_dv</code> Title: Colossal OSCAR 1 [dv; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2022-27","title":"Colossal OSCAR 1 [dv; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_dv</code> Title: Colossal OSCAR 1 [dv; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2022-49","title":"Colossal OSCAR 1 [dv; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_dv</code> Title: Colossal OSCAR 1 [dv; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2023-14","title":"Colossal OSCAR 1 [dv; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_dv</code> Title: Colossal OSCAR 1 [dv; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_dv/#colossal-oscar-1-dv-2023-23","title":"Colossal OSCAR 1 [dv; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_dv</code> Title: Colossal OSCAR 1 [dv; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_el/","title":"Greek Datasets","text":"<p>There are in total 23 datasets with 24 B tokens in Greek language.</p>"},{"location":"datasets/language_el/#colossal-oscar-1-el-2015-14","title":"Colossal OSCAR 1 [el; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_el</code> Title: Colossal OSCAR 1 [el; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2016-40","title":"Colossal OSCAR 1 [el; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_el</code> Title: Colossal OSCAR 1 [el; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2017-43","title":"Colossal OSCAR 1 [el; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_el</code> Title: Colossal OSCAR 1 [el; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2018-47","title":"Colossal OSCAR 1 [el; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_el</code> Title: Colossal OSCAR 1 [el; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2019-22","title":"Colossal OSCAR 1 [el; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_el</code> Title: Colossal OSCAR 1 [el; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2020-24","title":"Colossal OSCAR 1 [el; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_el</code> Title: Colossal OSCAR 1 [el; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2020-45","title":"Colossal OSCAR 1 [el; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_el</code> Title: Colossal OSCAR 1 [el; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2021-49","title":"Colossal OSCAR 1 [el; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_el</code> Title: Colossal OSCAR 1 [el; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2022-27","title":"Colossal OSCAR 1 [el; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_el</code> Title: Colossal OSCAR 1 [el; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2022-49","title":"Colossal OSCAR 1 [el; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_el</code> Title: Colossal OSCAR 1 [el; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2023-14","title":"Colossal OSCAR 1 [el; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_el</code> Title: Colossal OSCAR 1 [el; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_el/#colossal-oscar-1-el-2023-23","title":"Colossal OSCAR 1 [el; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_el</code> Title: Colossal OSCAR 1 [el; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 8 B"},{"location":"datasets/language_el/#eurlexresources-el","title":"EurlexResources [el]","text":"Dataset ID: <code>eurlex_el</code> Title: EurlexResources [el] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 7 B"},{"location":"datasets/language_el/#greek-legal-code","title":"Greek Legal Code","text":"Dataset ID: <code>greek_legal_code</code> Title: Greek Legal Code Description: Greek_Legal_Code (GLC) is a dataset consisting of approx. 47k legal resources from Greek legislation. The origin of GLC is \u201cPermanent Greek Legislation Code - Raptarchis\u201d, a collection of Greek legislative  documents classified into multi-level (from broader to more specialized) categories. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/greek_legal_code] License: unknown; likely publlic domain (commercial use: None, sharealike: None) Tokens: 45 M"},{"location":"datasets/language_el/#greek-web-corpus","title":"Greek Web Corpus","text":"Dataset ID: <code>greek_web_corpus</code> Title: Greek Web Corpus Description: A corpus of the Greek Web used for training <code>GreekBART: The First Pretrained Greek Sequence-to-Sequence Model</code> Availibility: <code>on_request</code> Homepage: [http://nlp.polytechnique.fr/resources-greek] License: unknown; likely fair use (commercial use: None, sharealike: None) Tokens: 3 B"},{"location":"datasets/language_el/#legalmc4-el","title":"LegalMC4 [el]","text":"Dataset ID: <code>legal_mc4_el</code> Title: LegalMC4 [el] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_el/#macocu-web-corpus-greek-10","title":"MaCoCu web corpus [Greek 1.0]","text":"Dataset ID: <code>macocu_el</code> Title: MaCoCu web corpus [Greek 1.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. 
See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1839] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_el/#wikibooks-el","title":"Wikibooks [el]","text":"Dataset ID: <code>wikibooks_el</code> Title: Wikibooks [el] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 19 M"},{"location":"datasets/language_el/#wikinews-el","title":"Wikinews [el]","text":"Dataset ID: <code>wikinews_el</code> Title: Wikinews [el] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 4 M"},{"location":"datasets/language_el/#wikipedia-el","title":"Wikipedia [el]","text":"Dataset ID: <code>wiki_el</code> Title: Wikipedia [el] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 584 M"},{"location":"datasets/language_el/#wikiquote-el","title":"Wikiquote [el]","text":"Dataset ID: <code>wikiquote_el</code> Title: Wikiquote [el] Description: The free quote compendium that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 5 M"},{"location":"datasets/language_el/#wikisource-el","title":"Wikisource [el]","text":"Dataset ID: <code>wikisource_el</code> Title: Wikisource [el] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 164 M"},{"location":"datasets/language_el/#wikivoyage-el","title":"Wikivoyage [el]","text":"Dataset ID: <code>wikivoyage_el</code> Title: Wikivoyage [el] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_en/","title":"English Datasets","text":"<p>There are in total 50 datasets with 117 B tokens in English language.</p>"},{"location":"datasets/language_en/#auxiliary-mathematics-problems-and-solutions-amps-dataset","title":"Auxiliary Mathematics Problems and Solutions (AMPS) dataset","text":"Dataset ID: <code>math_amps</code> Title: Auxiliary Mathematics Problems and Solutions (AMPS) dataset Description: Our pretraining dataset, the Auxiliary Mathematics Problems and Solutions (AMPS) dataset, has problems and step-by-step solutions typeset  in LATEX. AMPS contains over 100,000 problems pulled from Khan Academy and  approximately 5 million problems generated from manually designed Mathematica scripts. Problems include various aspects of algebra, calculus, counting and statistics, geometry, linear algebra, and number theory. 
Availibility: <code>None</code> Homepage: [https://github.com/hendrycks/math] License: repository license is MIT; no specific data license (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2015-14","title":"Colossal OSCAR 1 [en; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_en</code> Title: Colossal OSCAR 1 [en; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2016-40","title":"Colossal OSCAR 1 [en; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_en</code> Title: Colossal OSCAR 1 [en; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2017-43","title":"Colossal OSCAR 1 [en; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_en</code> Title: Colossal OSCAR 1 [en; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2018-47","title":"Colossal OSCAR 1 [en; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_en</code> Title: Colossal OSCAR 1 [en; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2019-22","title":"Colossal OSCAR 1 [en; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_en</code> Title: Colossal OSCAR 1 [en; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2020-24","title":"Colossal OSCAR 1 [en; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_en</code> Title: Colossal OSCAR 1 [en; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2020-45","title":"Colossal OSCAR 1 [en; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_en</code> Title: Colossal OSCAR 1 [en; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2021-49","title":"Colossal OSCAR 1 [en; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_en</code> Title: Colossal OSCAR 1 [en; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2022-27","title":"Colossal OSCAR 1 [en; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_en</code> Title: Colossal OSCAR 1 [en; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2022-49","title":"Colossal OSCAR 1 [en; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_en</code> Title: Colossal OSCAR 1 [en; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2023-14","title":"Colossal OSCAR 1 [en; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_en</code> Title: Colossal OSCAR 1 [en; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#colossal-oscar-1-en-2023-23","title":"Colossal OSCAR 1 [en; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_en</code> Title: Colossal OSCAR 1 [en; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#edgarcorpus","title":"EdgarCorpus","text":"Dataset ID: <code>edgarcorpus</code> Title: EdgarCorpus Description: The dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained. This dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/eloukas/edgar-corpus] License: Apache License Version 2.0 (commercial use: None, sharealike: None) Tokens: 7 B"},{"location":"datasets/language_en/#eurlexresources-en","title":"EurlexResources [en]","text":"Dataset ID: <code>eurlex_en</code> Title: EurlexResources [en] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 8 B"},{"location":"datasets/language_en/#legalmc4-en","title":"LegalMC4 [en]","text":"Dataset ID: <code>legal_mc4_en</code> Title: LegalMC4 [en] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. 
(commercial use: True, sharealike: None) Tokens: 967 M"},{"location":"datasets/language_en/#pile-of-law-selected-subsets","title":"Pile of Law (selected subsets)","text":"Dataset ID: <code>pile_of_law</code> Title: Pile of Law (selected subsets) Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: CreativeCommons Attribution-NonCommercial-ShareAlike 4.0 International. But individual sources may have other licenses. See paper for details. (commercial use: False, sharealike: True) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-selected-subsets_1","title":"Pile of Law (selected subsets)","text":"Dataset ID: <code>pile_of_law</code> Title: Pile of Law (selected subsets) Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: CreativeCommons Attribution-NonCommercial-ShareAlike 4.0 International. But individual sources may have other licenses. See paper for details. 
(commercial use: False, sharealike: True) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-acus_reports","title":"Pile of Law [acus_reports]","text":"Dataset ID: <code>pile_of_law_acus_reports</code> Title: Pile of Law [acus_reports] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-atticus_contracts","title":"Pile of Law [atticus_contracts]","text":"Dataset ID: <code>pile_of_law_atticus_contracts</code> Title: Pile of Law [atticus_contracts] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: CC BY 4.0 (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-cc_casebooks","title":"Pile of Law [cc_casebooks]","text":"Dataset ID: <code>pile_of_law_cc_casebooks</code> Title: Pile of Law [cc_casebooks] Description: We curate a large corpus of legal and administrative data. 
The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Mixed; Most restrictive: CC BY-NC-SA 4.0 (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-cfpb_creditcard_contracts","title":"Pile of Law [cfpb_creditcard_contracts]","text":"Dataset ID: <code>pile_of_law_cfpb_creditcard_contracts</code> Title: Pile of Law [cfpb_creditcard_contracts] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Publicly available, unknown license. Assumed to be governed by fair use standards. (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-congressional_hearings","title":"Pile of Law [congressional_hearings]","text":"Dataset ID: <code>pile_of_law_congressional_hearings</code> Title: Pile of Law [congressional_hearings] Description: We curate a large corpus of legal and administrative data. 
The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Public domain (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-constitutions","title":"Pile of Law [constitutions]","text":"Dataset ID: <code>pile_of_law_constitutions</code> Title: Pile of Law [constitutions] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: CC BY-NC 3.0 (commercial use: False, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-courtlistener_docket_entry_documents","title":"Pile of Law [courtlistener_docket_entry_documents]","text":"Dataset ID: <code>pile_of_law_courtlistener_docket_entry_documents</code> Title: Pile of Law [courtlistener_docket_entry_documents] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Underlying content is Public Domain. (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-courtlistener_opinions","title":"Pile of Law [courtlistener_opinions]","text":"Dataset ID: <code>pile_of_law_courtlistener_opinions</code> Title: Pile of Law [courtlistener_opinions] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Public domain (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-doj_guidance_documents","title":"Pile of Law [doj_guidance_documents]","text":"Dataset ID: <code>pile_of_law_doj_guidance_documents</code> Title: Pile of Law [doj_guidance_documents] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-echr","title":"Pile of Law [echr]","text":"Dataset ID: <code>pile_of_law_echr</code> Title: Pile of Law [echr] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Non-commercial, commercial use requires written permission (commercial use: False, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-ed_policy_guidance","title":"Pile of Law [ed_policy_guidance]","text":"Dataset ID: <code>pile_of_law_ed_policy_guidance</code> Title: Pile of Law [ed_policy_guidance] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-exam_outlines","title":"Pile of Law [exam_outlines]","text":"Dataset ID: <code>pile_of_law_exam_outlines</code> Title: Pile of Law [exam_outlines] Description: We curate a large corpus of legal and administrative data. 
The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Publicly available, unknown license. Assumed to be governed by fair use standards. (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-icj-pcij","title":"Pile of Law [icj-pcij]","text":"Dataset ID: <code>pile_of_law_icj-pcij</code> Title: Pile of Law [icj-pcij] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-medicaid_policy_guidance","title":"Pile of Law [medicaid_policy_guidance]","text":"Dataset ID: <code>pile_of_law_medicaid_policy_guidance</code> Title: Pile of Law [medicaid_policy_guidance] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-r_legaladvice","title":"Pile of Law [r_legaladvice]","text":"Dataset ID: <code>pile_of_law_r_legaladvice</code> Title: Pile of Law [r_legaladvice] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Creative Commons Attribution 4.0 International (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-resource_contracts","title":"Pile of Law [resource_contracts]","text":"Dataset ID: <code>pile_of_law_resource_contracts</code> Title: Pile of Law [resource_contracts] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-scotus_oral_arguments","title":"Pile of Law [scotus_oral_arguments]","text":"Dataset ID: <code>pile_of_law_scotus_oral_arguments</code> Title: Pile of Law [scotus_oral_arguments] Description: We curate a large corpus of legal and administrative data. 
The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Public domain (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-tos","title":"Pile of Law [tos]","text":"Dataset ID: <code>pile_of_law_tos</code> Title: Pile of Law [tos] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Publicly available, unknown license. Assumed to be governed by fair use standards. (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-un_debates","title":"Pile of Law [un_debates]","text":"Dataset ID: <code>pile_of_law_un_debates</code> Title: Pile of Law [un_debates] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: Public domain (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#pile-of-law-uspto_office_actions","title":"Pile of Law [uspto_office_actions]","text":"Dataset ID: <code>pile_of_law_uspto_office_actions</code> Title: Pile of Law [uspto_office_actions] Description: We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/pile-of-law/pile-of-law] License: None Tokens: N/A"},{"location":"datasets/language_en/#redpajama-data-t1-selected-subsets","title":"RedPajama-Data T1 (selected subsets)","text":"Dataset ID: <code>redpajama_book</code> Title: RedPajama-Data T1 (selected subsets) Description: An Open Source Recipe to Reproduce LLaMA training dataset Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T] License: partially copyrighted/pirated (commercial use: False, sharealike: None) Tokens: 26 B"},{"location":"datasets/language_en/#redpajama-data-t1-selected-subsets_1","title":"RedPajama-Data T1 (selected subsets)","text":"Dataset ID: <code>redpajama_stackexchange</code> Title: RedPajama-Data T1 (selected subsets) Description: An Open Source Recipe to Reproduce LLaMA training dataset Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T] License: cc-by-sa 4.0 (commercial use: True, sharealike: False) Tokens: 20 B"},{"location":"datasets/language_en/#wura-english","title":"WURA 
[English]","text":"Dataset ID: <code>wura_en</code> Title: WURA [English] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_en/#wikihow","title":"WikiHow","text":"Dataset ID: <code>wikihow</code> Title: WikiHow Description: WikiHow is a new large-scale dataset using the online WikiHow Availibility: <code>direct_download</code> Homepage: [https://github.com/mahnazkoupaee/WikiHow-Dataset] License: CC BY-NC-SA 3.0 (commercial use: False, sharealike: True) Tokens: 2 M"},{"location":"datasets/language_en/#wikibooks-en","title":"Wikibooks [en]","text":"Dataset ID: <code>wikibooks_en</code> Title: Wikibooks [en] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 129 M"},{"location":"datasets/language_en/#wikinews-en","title":"Wikinews [en]","text":"Dataset ID: <code>wikinews_en</code> Title: Wikinews [en] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 15 M"},{"location":"datasets/language_en/#wikipedia-en","title":"Wikipedia [en]","text":"Dataset ID: <code>wiki_en</code> Title: Wikipedia [en] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 4 B"},{"location":"datasets/language_en/#wikiquote-en","title":"Wikiquote [en]","text":"Dataset ID: <code>wikiquote_en</code> Title: Wikiquote [en] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 125 M"},{"location":"datasets/language_en/#wikisource-en","title":"Wikisource [en]","text":"Dataset ID: <code>wikisource_en</code> Title: Wikisource [en] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 731 M"},{"location":"datasets/language_en/#wikivoyage-en","title":"Wikivoyage [en]","text":"Dataset ID: <code>wikivoyage_en</code> Title: Wikivoyage [en] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 48 M"},{"location":"datasets/language_en/#pes2o","title":"peS2o","text":"Dataset ID: <code>pes2o</code> Title: peS2o Description: The peS2o dataset is a collection of ~40M creative open-access academic papers, cleaned, filtered, and formatted for pre-training of language models. It is derived from the Semantic Scholar Open Research Corpus(Lo et al, 2020), or S2ORC. 
Availibility: <code>None</code> Homepage: [https://huggingface.co/datasets/allenai/peS2o] License: Open Data Commons Attribution License (ODC-By) v1.0 (commercial use: True, sharealike: False) Tokens: 42 B"},{"location":"datasets/language_en/#proof-pile","title":"proof-pile","text":"Dataset ID: <code>proof_pile</code> Title: proof-pile Description: The proof-pile is a 13GB pre-training dataset of mathematical text that comprises 8.3 billion tokens (using the gpt-neox tokenizer). The dataset is composed of diverse sources of both informal and formal mathematics, namely Availibility: <code>None</code> Homepage: [https://huggingface.co/datasets/hoskinson-center/proof-pile] License: Apache 2.0 (probably code license instead of data license) (commercial use: None, sharealike: None) Tokens: 8 B <p>This page is automatically generated.</p>"},{"location":"datasets/language_eo/","title":"Esperanto Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Esperanto language.</p>"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2015-14","title":"Colossal OSCAR 1 [eo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_eo</code> Title: Colossal OSCAR 1 [eo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2016-40","title":"Colossal OSCAR 1 [eo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_eo</code> Title: Colossal OSCAR 1 [eo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2017-43","title":"Colossal OSCAR 1 [eo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_eo</code> Title: Colossal OSCAR 1 [eo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2018-47","title":"Colossal OSCAR 1 [eo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_eo</code> Title: Colossal OSCAR 1 [eo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2019-22","title":"Colossal OSCAR 1 [eo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_eo</code> Title: Colossal OSCAR 1 [eo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2020-24","title":"Colossal OSCAR 1 [eo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_eo</code> Title: Colossal OSCAR 1 [eo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2020-45","title":"Colossal OSCAR 1 [eo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_eo</code> Title: Colossal OSCAR 1 [eo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2021-49","title":"Colossal OSCAR 1 [eo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_eo</code> Title: Colossal OSCAR 1 [eo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2022-27","title":"Colossal OSCAR 1 [eo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_eo</code> Title: Colossal OSCAR 1 [eo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2022-49","title":"Colossal OSCAR 1 [eo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_eo</code> Title: Colossal OSCAR 1 [eo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2023-14","title":"Colossal OSCAR 1 [eo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_eo</code> Title: Colossal OSCAR 1 [eo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eo/#colossal-oscar-1-eo-2023-23","title":"Colossal OSCAR 1 [eo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_eo</code> Title: Colossal OSCAR 1 [eo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_es/","title":"Spanish Datasets","text":"<p>There are in total 21 datasets with 20 B tokens in Spanish language.</p>"},{"location":"datasets/language_es/#colossal-oscar-1-es-2015-14","title":"Colossal OSCAR 1 [es; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_es</code> Title: Colossal OSCAR 1 [es; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2016-40","title":"Colossal OSCAR 1 [es; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_es</code> Title: Colossal OSCAR 1 [es; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2017-43","title":"Colossal OSCAR 1 [es; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_es</code> Title: Colossal OSCAR 1 [es; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2018-47","title":"Colossal OSCAR 1 [es; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_es</code> Title: Colossal OSCAR 1 [es; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2019-22","title":"Colossal OSCAR 1 [es; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_es</code> Title: Colossal OSCAR 1 [es; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2020-24","title":"Colossal OSCAR 1 [es; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_es</code> Title: Colossal OSCAR 1 [es; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2020-45","title":"Colossal OSCAR 1 [es; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_es</code> Title: Colossal OSCAR 1 [es; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2021-49","title":"Colossal OSCAR 1 [es; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_es</code> Title: Colossal OSCAR 1 [es; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2022-27","title":"Colossal OSCAR 1 [es; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_es</code> Title: Colossal OSCAR 1 [es; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2022-49","title":"Colossal OSCAR 1 [es; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_es</code> Title: Colossal OSCAR 1 [es; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2023-14","title":"Colossal OSCAR 1 [es; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_es</code> Title: Colossal OSCAR 1 [es; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#colossal-oscar-1-es-2023-23","title":"Colossal OSCAR 1 [es; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_es</code> Title: Colossal OSCAR 1 [es; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_es/#eurlexresources-es","title":"EurlexResources [es]","text":"Dataset ID: <code>eurlex_es</code> Title: EurlexResources [es] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 7 B"},{"location":"datasets/language_es/#legalmc4-es","title":"LegalMC4 [es]","text":"Dataset ID: <code>legal_mc4_es</code> Title: LegalMC4 [es] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. 
By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 9 B"},{"location":"datasets/language_es/#spanish-legal-domain-corpora","title":"Spanish Legal Domain Corpora","text":"Dataset ID: <code>spanish_legal</code> Title: Spanish Legal Domain Corpora Description: A collection of corpora of Spanish legal domain. Availibility: <code>None</code> Homepage: [https://github.com/PlanTL-GOB-ES/lm-legal-es] License: Creative Commons Attribution 4.0 International (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_es/#wikibooks-es","title":"Wikibooks [es]","text":"Dataset ID: <code>wikibooks_es</code> Title: Wikibooks [es] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 24 M"},{"location":"datasets/language_es/#wikinews-es","title":"Wikinews [es]","text":"Dataset ID: <code>wikinews_es</code> Title: Wikinews [es] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 7 M"},{"location":"datasets/language_es/#wikipedia-es","title":"Wikipedia [es]","text":"Dataset ID: <code>wiki_es</code> Title: Wikipedia [es] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 B"},{"location":"datasets/language_es/#wikiquote-es","title":"Wikiquote [es]","text":"Dataset ID: <code>wikiquote_es</code> Title: Wikiquote [es] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 5 M"},{"location":"datasets/language_es/#wikisource-es","title":"Wikisource [es]","text":"Dataset ID: <code>wikisource_es</code> Title: Wikisource [es] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 112 M"},{"location":"datasets/language_es/#wikivoyage-es","title":"Wikivoyage [es]","text":"Dataset ID: <code>wikivoyage_es</code> Title: Wikivoyage [es] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 14 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_et/","title":"Estonian Datasets","text":"<p>There are in total 21 datasets with 5 B tokens in Estonian language.</p>"},{"location":"datasets/language_et/#colossal-oscar-1-et-2015-14","title":"Colossal OSCAR 1 [et; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_et</code> Title: Colossal OSCAR 1 [et; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2016-40","title":"Colossal OSCAR 1 [et; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_et</code> Title: Colossal OSCAR 1 [et; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2017-43","title":"Colossal OSCAR 1 [et; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_et</code> Title: Colossal OSCAR 1 [et; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2018-47","title":"Colossal OSCAR 1 [et; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_et</code> Title: Colossal OSCAR 1 [et; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2019-22","title":"Colossal OSCAR 1 [et; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_et</code> Title: Colossal OSCAR 1 [et; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2020-24","title":"Colossal OSCAR 1 [et; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_et</code> Title: Colossal OSCAR 1 [et; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2020-45","title":"Colossal OSCAR 1 [et; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_et</code> Title: Colossal OSCAR 1 [et; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2021-49","title":"Colossal OSCAR 1 [et; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_et</code> Title: Colossal OSCAR 1 [et; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2022-27","title":"Colossal OSCAR 1 [et; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_et</code> Title: Colossal OSCAR 1 [et; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2022-49","title":"Colossal OSCAR 1 [et; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_et</code> Title: Colossal OSCAR 1 [et; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2023-14","title":"Colossal OSCAR 1 [et; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_et</code> Title: Colossal OSCAR 1 [et; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#colossal-oscar-1-et-2023-23","title":"Colossal OSCAR 1 [et; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_et</code> Title: Colossal OSCAR 1 [et; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 1 B"},{"location":"datasets/language_et/#ekspress-news-article-archive-only-estonian-10","title":"Ekspress news article archive (only Estonian) 1.0","text":"Dataset ID: <code>ekspress</code> Title: Ekspress news article archive (only Estonian) 1.0 Description: The dataset is an archive of articles from the Ekspress Meedia news site from 2009-2019, containing over 1.4M articles, mostly in Estonian language (1,115,120 articles) with some  in Russian (325,952 articles). Availibility: <code>None</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1408] License: Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) (commercial use: False, sharealike: None) Tokens: N/A"},{"location":"datasets/language_et/#estonian-national-corpus-2021","title":"Estonian National Corpus 2021","text":"Dataset ID: <code>enc2021</code> Title: Estonian National Corpus 2021 Description: Corpus is based on Estonian National Corpus 2013, which was renewed by Lexical Computing Ltd. in 2017 and 2019 at the request of Estonian Language Institute.Subcorpora are: Estonian Reference Corpus 1990-2008, Estonian Web 2013, Estonian Web 2017, Estonian Web 2019, Estonian Wikipedia 2017, Estonian Wikipedia 2019, Estonian Open Access Journals (DOAJ), blogs, discussion, education, fiction, food, health, journals, news, religion, science, sex, society, sports. 
Availibility: <code>None</code> Homepage: [https://entu.keeleressursid.ee/shared/9939/EVKultjxSeFA2QhFkbE7fGGDGNT1zmJLOUGFK9hw53tq9Rx2YBTejI1IoKhy65zq] License: Creative Commons Attribution-NonCommercial 4.0 International License (commercial use: False, sharealike: False) Tokens: N/A"},{"location":"datasets/language_et/#estonian-reference-corpus","title":"Estonian Reference Corpus","text":"Dataset ID: <code>estonian_reference_corpus</code> Title: Estonian Reference Corpus Description: This corpus includes Estonian texts (fiction, PhD theses, newspapers, magazines, parliamentary transcriptions, computer-mediated communication) published between 1990 and 2007. The corpus is encoded in TEI. The corpus is available for online browsing through a dedicated concordancer and is available for download from CELR. Availibility: <code>direct_download</code> Homepage: [https://www.cl.ut.ee/korpused/segakorpus/] License: free for non-commercial use (commercial use: False, sharealike: False) Tokens: 175 M"},{"location":"datasets/language_et/#eurlexresources-et","title":"EurlexResources [et]","text":"Dataset ID: <code>eurlex_et</code> Title: EurlexResources [et] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_et/#legalmc4-et","title":"LegalMC4 [et]","text":"Dataset ID: <code>legal_mc4_et</code> Title: LegalMC4 [et] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. 
(commercial use: True, sharealike: None) Tokens: 110 M"},{"location":"datasets/language_et/#wikibooks-et","title":"Wikibooks [et]","text":"Dataset ID: <code>wikibooks_et</code> Title: Wikibooks [et] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_et/#wikipedia-et","title":"Wikipedia [et]","text":"Dataset ID: <code>wiki_et</code> Title: Wikipedia [et] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 61 M"},{"location":"datasets/language_et/#wikiquote-et","title":"Wikiquote [et]","text":"Dataset ID: <code>wikiquote_et</code> Title: Wikiquote [et] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 13 M"},{"location":"datasets/language_et/#wikisource-et","title":"Wikisource [et]","text":"Dataset ID: <code>wikisource_et</code> Title: Wikisource [et] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_eu/","title":"Basque Datasets","text":"<p>There are in total 19 datasets with 982 M tokens in Basque language.</p>"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2015-14","title":"Colossal OSCAR 1 [eu; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_eu</code> Title: Colossal OSCAR 1 [eu; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2016-40","title":"Colossal OSCAR 1 [eu; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_eu</code> Title: Colossal OSCAR 1 [eu; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2017-43","title":"Colossal OSCAR 1 [eu; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_eu</code> Title: Colossal OSCAR 1 [eu; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2018-47","title":"Colossal OSCAR 1 [eu; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_eu</code> Title: Colossal OSCAR 1 [eu; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2019-22","title":"Colossal OSCAR 1 [eu; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_eu</code> Title: Colossal OSCAR 1 [eu; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2020-24","title":"Colossal OSCAR 1 [eu; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_eu</code> Title: Colossal OSCAR 1 [eu; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2020-45","title":"Colossal OSCAR 1 [eu; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_eu</code> Title: Colossal OSCAR 1 [eu; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2021-49","title":"Colossal OSCAR 1 [eu; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_eu</code> Title: Colossal OSCAR 1 [eu; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2022-27","title":"Colossal OSCAR 1 [eu; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_eu</code> Title: Colossal OSCAR 1 [eu; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2022-49","title":"Colossal OSCAR 1 [eu; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_eu</code> Title: Colossal OSCAR 1 [eu; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2023-14","title":"Colossal OSCAR 1 [eu; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_eu</code> Title: Colossal OSCAR 1 [eu; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#colossal-oscar-1-eu-2023-23","title":"Colossal OSCAR 1 [eu; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_eu</code> Title: Colossal OSCAR 1 [eu; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 136 M"},{"location":"datasets/language_eu/#euscrawl","title":"EusCrawl","text":"Dataset ID: <code>euscrawl</code> Title: EusCrawl Description: EusCrawl (http://www.ixa.eus/euscrawl/) is a high-quality corpus for Basque comprising 12.5 million documents and 423 million tokens, totalling 2.1 GiB of uncompressed text. EusCrawl was built using ad-hoc scrapers to extract text from 33 Basque websites with high-quality content, resulting in cleaner text compared to general purpose approaches. Availibility: <code>None</code> Homepage: [None] License: mixed (see Tab. 2 in paper, e.g., CC-BY-NC-ND, CC-BY-NC-SA) (commercial use: None, sharealike: None) Tokens: 423 M"},{"location":"datasets/language_eu/#euscrawl-filtered-no-wikipedia-no-nc-licenses","title":"EusCrawl (filtered: no Wikipedia, no NC-licenses)","text":"Dataset ID: <code>euscrawl_filtered</code> Title: EusCrawl (filtered: no Wikipedia, no NC-licenses) Description: EusCrawl (http://www.ixa.eus/euscrawl/) is a high-quality corpus for Basque comprising 12.5 million documents and 423 million tokens, totalling 2.1 GiB of uncompressed text. EusCrawl was built using ad-hoc scrapers to extract text from 33 Basque websites with high-quality content, resulting in cleaner text compared to general purpose approaches. Availibility: <code>None</code> Homepage: [None] License: CC-BY-SA (commercial use: True, sharealike: False) Tokens: 423 M"},{"location":"datasets/language_eu/#wikibooks-eu","title":"Wikibooks [eu]","text":"Dataset ID: <code>wikibooks_eu</code> Title: Wikibooks [eu] Description: The open-content textbooks collection that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#wikinews-eu","title":"Wikinews [eu]","text":"Dataset ID: <code>wikinews_eu</code> Title: Wikinews [eu] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#wikipedia-eu","title":"Wikipedia [eu]","text":"Dataset ID: <code>wiki_eu</code> Title: Wikipedia [eu] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#wikiquote-eu","title":"Wikiquote [eu]","text":"Dataset ID: <code>wikiquote_eu</code> Title: Wikiquote [eu] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_eu/#wikisource-eu","title":"Wikisource [eu]","text":"Dataset ID: <code>wikisource_eu</code> Title: Wikisource [eu] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_fa/","title":"Persian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Persian language.</p>"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2015-14","title":"Colossal OSCAR 1 [fa; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_fa</code> Title: Colossal OSCAR 1 [fa; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2016-40","title":"Colossal OSCAR 1 [fa; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_fa</code> Title: Colossal OSCAR 1 [fa; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2017-43","title":"Colossal OSCAR 1 [fa; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_fa</code> Title: Colossal OSCAR 1 [fa; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2018-47","title":"Colossal OSCAR 1 [fa; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_fa</code> Title: Colossal OSCAR 1 [fa; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2019-22","title":"Colossal OSCAR 1 [fa; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_fa</code> Title: Colossal OSCAR 1 [fa; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2020-24","title":"Colossal OSCAR 1 [fa; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_fa</code> Title: Colossal OSCAR 1 [fa; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2020-45","title":"Colossal OSCAR 1 [fa; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_fa</code> Title: Colossal OSCAR 1 [fa; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2021-49","title":"Colossal OSCAR 1 [fa; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_fa</code> Title: Colossal OSCAR 1 [fa; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2022-27","title":"Colossal OSCAR 1 [fa; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_fa</code> Title: Colossal OSCAR 1 [fa; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2022-49","title":"Colossal OSCAR 1 [fa; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_fa</code> Title: Colossal OSCAR 1 [fa; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2023-14","title":"Colossal OSCAR 1 [fa; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_fa</code> Title: Colossal OSCAR 1 [fa; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fa/#colossal-oscar-1-fa-2023-23","title":"Colossal OSCAR 1 [fa; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_fa</code> Title: Colossal OSCAR 1 [fa; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_fi/","title":"Finnish Datasets","text":"<p>There are in total 21 datasets with 9 B tokens in Finnish language.</p>"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2015-14","title":"Colossal OSCAR 1 [fi; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_fi</code> Title: Colossal OSCAR 1 [fi; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2016-40","title":"Colossal OSCAR 1 [fi; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_fi</code> Title: Colossal OSCAR 1 [fi; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2017-43","title":"Colossal OSCAR 1 [fi; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_fi</code> Title: Colossal OSCAR 1 [fi; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2018-47","title":"Colossal OSCAR 1 [fi; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_fi</code> Title: Colossal OSCAR 1 [fi; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2019-22","title":"Colossal OSCAR 1 [fi; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_fi</code> Title: Colossal OSCAR 1 [fi; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2020-24","title":"Colossal OSCAR 1 [fi; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_fi</code> Title: Colossal OSCAR 1 [fi; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2020-45","title":"Colossal OSCAR 1 [fi; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_fi</code> Title: Colossal OSCAR 1 [fi; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2021-49","title":"Colossal OSCAR 1 [fi; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_fi</code> Title: Colossal OSCAR 1 [fi; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2022-27","title":"Colossal OSCAR 1 [fi; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_fi</code> Title: Colossal OSCAR 1 [fi; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2022-49","title":"Colossal OSCAR 1 [fi; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_fi</code> Title: Colossal OSCAR 1 [fi; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2023-14","title":"Colossal OSCAR 1 [fi; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_fi</code> Title: Colossal OSCAR 1 [fi; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fi/#colossal-oscar-1-fi-2023-23","title":"Colossal OSCAR 1 [fi; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_fi</code> Title: Colossal OSCAR 1 [fi; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 4 B"},{"location":"datasets/language_fi/#eurlexresources-fi","title":"EurlexResources [fi]","text":"Dataset ID: <code>eurlex_fi</code> Title: EurlexResources [fi] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_fi/#legalmc4-fi","title":"LegalMC4 [fi]","text":"Dataset ID: <code>legal_mc4_fi</code> Title: LegalMC4 [fi] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 63 M"},{"location":"datasets/language_fi/#wikibooks-fi","title":"Wikibooks [fi]","text":"Dataset ID: <code>wikibooks_fi</code> Title: Wikibooks [fi] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 5 M"},{"location":"datasets/language_fi/#wikinews-fi","title":"Wikinews [fi]","text":"Dataset ID: <code>wikinews_fi</code> Title: Wikinews [fi] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 748 k"},{"location":"datasets/language_fi/#wikipedia-fi","title":"Wikipedia [fi]","text":"Dataset ID: <code>wiki_fi</code> Title: Wikipedia [fi] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 137 M"},{"location":"datasets/language_fi/#wikiquote-fi","title":"Wikiquote [fi]","text":"Dataset ID: <code>wikiquote_fi</code> Title: Wikiquote [fi] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 2 M"},{"location":"datasets/language_fi/#wikisource-fi","title":"Wikisource [fi]","text":"Dataset ID: <code>wikisource_fi</code> Title: Wikisource [fi] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 18 M"},{"location":"datasets/language_fi/#wikivoyage-fi","title":"Wikivoyage [fi]","text":"Dataset ID: <code>wikivoyage_fi</code> Title: Wikivoyage [fi] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_fi/#yle-finnish-news-archive","title":"Yle Finnish News Archive","text":"Dataset ID: <code>ylenews</code> Title: Yle Finnish News Archive Description: The corpus, containing the articles from YLE https://yle.fi from 2019 and 2020, is available at www.kielipankki.fi/download Availibility: <code>signin_download</code> Homepage: [http://urn.fi/urn:nbn:fi:lb-2021050401] License: CLARIN ACA - NC (Academic - Non Commercial Use, Attribution, No Redistribution, Other) (commercial use: False, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_fr/","title":"French Datasets","text":"<p>There are in total 22 datasets with 60 B tokens in French language.</p>"},{"location":"datasets/language_fr/#cabernet-a-new-french-balanced-reference-corpus","title":"CaBeRnet: a New French Balanced Reference Corpus","text":"Dataset ID: <code>cabernet</code> Title: CaBeRnet: a New French Balanced Reference Corpus Description: A new balanced French corpus, CaBeRnet, that features a representative range of language usage, including a balanced variety of genres (oral transcriptions, newspapers, popular magazines, technical reports, fiction, academic texts), in oral and written styles. 
Availibility: <code>None</code> Homepage: [https://aclanthology.org/2020.cmlc-1.3/] License: Creative Commons License (commercial use: None, sharealike: None) Tokens: 712 M"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2015-14","title":"Colossal OSCAR 1 [fr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_fr</code> Title: Colossal OSCAR 1 [fr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2016-40","title":"Colossal OSCAR 1 [fr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_fr</code> Title: Colossal OSCAR 1 [fr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2017-43","title":"Colossal OSCAR 1 [fr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_fr</code> Title: Colossal OSCAR 1 [fr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2018-47","title":"Colossal OSCAR 1 [fr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_fr</code> Title: Colossal OSCAR 1 [fr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2019-22","title":"Colossal OSCAR 1 [fr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_fr</code> Title: Colossal OSCAR 1 [fr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2020-24","title":"Colossal OSCAR 1 [fr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_fr</code> Title: Colossal OSCAR 1 [fr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2020-45","title":"Colossal OSCAR 1 [fr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_fr</code> Title: Colossal OSCAR 1 [fr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2021-49","title":"Colossal OSCAR 1 [fr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_fr</code> Title: Colossal OSCAR 1 [fr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2022-27","title":"Colossal OSCAR 1 [fr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_fr</code> Title: Colossal OSCAR 1 [fr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2022-49","title":"Colossal OSCAR 1 [fr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_fr</code> Title: Colossal OSCAR 1 [fr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2023-14","title":"Colossal OSCAR 1 [fr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_fr</code> Title: Colossal OSCAR 1 [fr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#colossal-oscar-1-fr-2023-23","title":"Colossal OSCAR 1 [fr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_fr</code> Title: Colossal OSCAR 1 [fr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 48 B"},{"location":"datasets/language_fr/#eurlexresources-fr","title":"EurlexResources [fr]","text":"Dataset ID: <code>eurlex_fr</code> Title: EurlexResources [fr] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 8 B"},{"location":"datasets/language_fr/#legalmc4-fr","title":"LegalMC4 [fr]","text":"Dataset ID: <code>legal_mc4_fr</code> Title: LegalMC4 [fr] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_fr/#wura-french","title":"WURA [French]","text":"Dataset ID: <code>wura_fr</code> Title: WURA [French] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fr/#wikibooks-fr","title":"Wikibooks [fr]","text":"Dataset ID: <code>wikibooks_fr</code> Title: Wikibooks [fr] Description: The open-content textbooks collection that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 24 M"},{"location":"datasets/language_fr/#wikinews-fr","title":"Wikinews [fr]","text":"Dataset ID: <code>wikinews_fr</code> Title: Wikinews [fr] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 8 M"},{"location":"datasets/language_fr/#wikipedia-fr","title":"Wikipedia [fr]","text":"Dataset ID: <code>wiki_fr</code> Title: Wikipedia [fr] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 B"},{"location":"datasets/language_fr/#wikiquote-fr","title":"Wikiquote [fr]","text":"Dataset ID: <code>wikiquote_fr</code> Title: Wikiquote [fr] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 473 k"},{"location":"datasets/language_fr/#wikisource-fr","title":"Wikisource [fr]","text":"Dataset ID: <code>wikisource_fr</code> Title: Wikisource [fr] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 38 M"},{"location":"datasets/language_fr/#wikivoyage-fr","title":"Wikivoyage [fr]","text":"Dataset ID: <code>wikivoyage_fr</code> Title: Wikivoyage [fr] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 7 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_fy/","title":"Western Frisian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Western Frisian language.</p>"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2015-14","title":"Colossal OSCAR 1 [fy; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_fy</code> Title: Colossal OSCAR 1 [fy; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2016-40","title":"Colossal OSCAR 1 [fy; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_fy</code> Title: Colossal OSCAR 1 [fy; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2017-43","title":"Colossal OSCAR 1 [fy; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_fy</code> Title: Colossal OSCAR 1 [fy; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2018-47","title":"Colossal OSCAR 1 [fy; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_fy</code> Title: Colossal OSCAR 1 [fy; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2019-22","title":"Colossal OSCAR 1 [fy; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_fy</code> Title: Colossal OSCAR 1 [fy; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2020-24","title":"Colossal OSCAR 1 [fy; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_fy</code> Title: Colossal OSCAR 1 [fy; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2020-45","title":"Colossal OSCAR 1 [fy; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_fy</code> Title: Colossal OSCAR 1 [fy; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2021-49","title":"Colossal OSCAR 1 [fy; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_fy</code> Title: Colossal OSCAR 1 [fy; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2022-27","title":"Colossal OSCAR 1 [fy; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_fy</code> Title: Colossal OSCAR 1 [fy; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2022-49","title":"Colossal OSCAR 1 [fy; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_fy</code> Title: Colossal OSCAR 1 [fy; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2023-14","title":"Colossal OSCAR 1 [fy; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_fy</code> Title: Colossal OSCAR 1 [fy; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_fy/#colossal-oscar-1-fy-2023-23","title":"Colossal OSCAR 1 [fy; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_fy</code> Title: Colossal OSCAR 1 [fy; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ga/","title":"Irish Datasets","text":"<p>There are in total 19 datasets with 669 M tokens in Irish language.</p>"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2015-14","title":"Colossal OSCAR 1 [ga; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ga</code> Title: Colossal OSCAR 1 [ga; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2016-40","title":"Colossal OSCAR 1 [ga; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ga</code> Title: Colossal OSCAR 1 [ga; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2017-43","title":"Colossal OSCAR 1 [ga; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ga</code> Title: Colossal OSCAR 1 [ga; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2018-47","title":"Colossal OSCAR 1 [ga; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ga</code> Title: Colossal OSCAR 1 [ga; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2019-22","title":"Colossal OSCAR 1 [ga; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ga</code> Title: Colossal OSCAR 1 [ga; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2020-24","title":"Colossal OSCAR 1 [ga; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ga</code> Title: Colossal OSCAR 1 [ga; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2020-45","title":"Colossal OSCAR 1 [ga; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ga</code> Title: Colossal OSCAR 1 [ga; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2021-49","title":"Colossal OSCAR 1 [ga; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ga</code> Title: Colossal OSCAR 1 [ga; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2022-27","title":"Colossal OSCAR 1 [ga; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ga</code> Title: Colossal OSCAR 1 [ga; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2022-49","title":"Colossal OSCAR 1 [ga; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ga</code> Title: Colossal OSCAR 1 [ga; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2023-14","title":"Colossal OSCAR 1 [ga; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ga</code> Title: Colossal OSCAR 1 [ga; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#colossal-oscar-1-ga-2023-23","title":"Colossal OSCAR 1 [ga; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ga</code> Title: Colossal OSCAR 1 [ga; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 5 M"},{"location":"datasets/language_ga/#eurlexresources-ga","title":"EurlexResources [ga]","text":"Dataset ID: <code>eurlex_ga</code> Title: EurlexResources [ga] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 650 M"},{"location":"datasets/language_ga/#irish-universal-dependencies","title":"Irish Universal Dependencies","text":"Dataset ID: <code>ga_universal_dependencies</code> Title: Irish Universal Dependencies Description: Universal Dependencies (UD) is a framework for consistent annotation of grammar (parts of speech, morphological features, and syntactic dependencies) across different human languages. 
Availibility: <code>direct_download</code> Homepage: [https://universaldependencies.org/] License: mixed (CC BY-SA 3.0 or CC BY-SA 4.0) (commercial use: True, sharealike: True) Tokens: 3 M"},{"location":"datasets/language_ga/#legalmc4-ga","title":"LegalMC4 [ga]","text":"Dataset ID: <code>legal_mc4_ga</code> Title: LegalMC4 [ga] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 33 k"},{"location":"datasets/language_ga/#the-gaois-bilingual-corpus-of-english-irish-legislation-irish-legislation","title":"The Gaois bilingual corpus of English-Irish legislation (Irish legislation)","text":"Dataset ID: <code>ga_bilingual_legistation</code> Title: The Gaois bilingual corpus of English-Irish legislation (Irish legislation) Description: Bilingual corpus of English-Irish legislation provided by the Department of Justice. Availibility: <code>signin_download</code> Homepage: [https://portulanclarin.net/repository/browse/the-gaois-bilingual-corpus-of-english-irish-legislation-processed/daeac17c9e3511ea9b7f02420a000407b83de243dc0b469aab41084386c5b80f/] License: Open Under - PSI (commercial use: None, sharealike: None) Tokens: 4 M"},{"location":"datasets/language_ga/#wikibooks-ga","title":"Wikibooks [ga]","text":"Dataset ID: <code>wikibooks_ga</code> Title: Wikibooks [ga] Description: The open-content textbooks collection that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ga/#wikipedia-ga","title":"Wikipedia [ga]","text":"Dataset ID: <code>wiki_ga</code> Title: Wikipedia [ga] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 6 M"},{"location":"datasets/language_ga/#wikiquote-ga","title":"Wikiquote [ga]","text":"Dataset ID: <code>wikiquote_ga</code> Title: Wikiquote [ga] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 233 <p>This page is automatically generated.</p>"},{"location":"datasets/language_gd/","title":"Gaelic Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Gaelic language.</p>"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2015-14","title":"Colossal OSCAR 1 [gd; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_gd</code> Title: Colossal OSCAR 1 [gd; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2016-40","title":"Colossal OSCAR 1 [gd; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_gd</code> Title: Colossal OSCAR 1 [gd; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2017-43","title":"Colossal OSCAR 1 [gd; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_gd</code> Title: Colossal OSCAR 1 [gd; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2018-47","title":"Colossal OSCAR 1 [gd; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_gd</code> Title: Colossal OSCAR 1 [gd; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2019-22","title":"Colossal OSCAR 1 [gd; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_gd</code> Title: Colossal OSCAR 1 [gd; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2020-24","title":"Colossal OSCAR 1 [gd; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_gd</code> Title: Colossal OSCAR 1 [gd; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2020-45","title":"Colossal OSCAR 1 [gd; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_gd</code> Title: Colossal OSCAR 1 [gd; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2021-49","title":"Colossal OSCAR 1 [gd; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_gd</code> Title: Colossal OSCAR 1 [gd; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2022-27","title":"Colossal OSCAR 1 [gd; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_gd</code> Title: Colossal OSCAR 1 [gd; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2022-49","title":"Colossal OSCAR 1 [gd; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_gd</code> Title: Colossal OSCAR 1 [gd; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2023-14","title":"Colossal OSCAR 1 [gd; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_gd</code> Title: Colossal OSCAR 1 [gd; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gd/#colossal-oscar-1-gd-2023-23","title":"Colossal OSCAR 1 [gd; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_gd</code> Title: Colossal OSCAR 1 [gd; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_gl/","title":"Galician Datasets","text":"<p>There are in total 17 datasets with 36 M tokens in Galician language.</p>"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2015-14","title":"Colossal OSCAR 1 [gl; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_gl</code> Title: Colossal OSCAR 1 [gl; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2016-40","title":"Colossal OSCAR 1 [gl; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_gl</code> Title: Colossal OSCAR 1 [gl; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2017-43","title":"Colossal OSCAR 1 [gl; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_gl</code> Title: Colossal OSCAR 1 [gl; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2018-47","title":"Colossal OSCAR 1 [gl; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_gl</code> Title: Colossal OSCAR 1 [gl; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2019-22","title":"Colossal OSCAR 1 [gl; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_gl</code> Title: Colossal OSCAR 1 [gl; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2020-24","title":"Colossal OSCAR 1 [gl; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_gl</code> Title: Colossal OSCAR 1 [gl; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2020-45","title":"Colossal OSCAR 1 [gl; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_gl</code> Title: Colossal OSCAR 1 [gl; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2021-49","title":"Colossal OSCAR 1 [gl; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_gl</code> Title: Colossal OSCAR 1 [gl; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2022-27","title":"Colossal OSCAR 1 [gl; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_gl</code> Title: Colossal OSCAR 1 [gl; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2022-49","title":"Colossal OSCAR 1 [gl; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_gl</code> Title: Colossal OSCAR 1 [gl; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2023-14","title":"Colossal OSCAR 1 [gl; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_gl</code> Title: Colossal OSCAR 1 [gl; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#colossal-oscar-1-gl-2023-23","title":"Colossal OSCAR 1 [gl; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_gl</code> Title: Colossal OSCAR 1 [gl; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 36 M"},{"location":"datasets/language_gl/#wikibooks-gl","title":"Wikibooks [gl]","text":"Dataset ID: <code>wikibooks_gl</code> Title: Wikibooks [gl] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#wikinews-gl","title":"Wikinews [gl]","text":"Dataset ID: <code>wikinews_gl</code> Title: Wikinews [gl] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#wikipedia-gl","title":"Wikipedia [gl]","text":"Dataset ID: <code>wiki_gl</code> Title: Wikipedia [gl] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#wikiquote-gl","title":"Wikiquote [gl]","text":"Dataset ID: <code>wikiquote_gl</code> Title: Wikiquote [gl] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gl/#wikisource-gl","title":"Wikisource [gl]","text":"Dataset ID: <code>wikisource_gl</code> Title: Wikisource [gl] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_gn/","title":"Guaran\u00ed Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Guaran\u00ed language.</p>"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2015-14","title":"Colossal OSCAR 1 [gn; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_gn</code> Title: Colossal OSCAR 1 [gn; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2016-40","title":"Colossal OSCAR 1 [gn; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_gn</code> Title: Colossal OSCAR 1 [gn; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2017-43","title":"Colossal OSCAR 1 [gn; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_gn</code> Title: Colossal OSCAR 1 [gn; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2018-47","title":"Colossal OSCAR 1 [gn; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_gn</code> Title: Colossal OSCAR 1 [gn; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2019-22","title":"Colossal OSCAR 1 [gn; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_gn</code> Title: Colossal OSCAR 1 [gn; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2020-24","title":"Colossal OSCAR 1 [gn; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_gn</code> Title: Colossal OSCAR 1 [gn; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2020-45","title":"Colossal OSCAR 1 [gn; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_gn</code> Title: Colossal OSCAR 1 [gn; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2021-49","title":"Colossal OSCAR 1 [gn; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_gn</code> Title: Colossal OSCAR 1 [gn; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2022-27","title":"Colossal OSCAR 1 [gn; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_gn</code> Title: Colossal OSCAR 1 [gn; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2022-49","title":"Colossal OSCAR 1 [gn; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_gn</code> Title: Colossal OSCAR 1 [gn; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2023-14","title":"Colossal OSCAR 1 [gn; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_gn</code> Title: Colossal OSCAR 1 [gn; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gn/#colossal-oscar-1-gn-2023-23","title":"Colossal OSCAR 1 [gn; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_gn</code> Title: Colossal OSCAR 1 [gn; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_gom/","title":"Gom Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Gom language.</p>"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2015-14","title":"Colossal OSCAR 1 [gom; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_gom</code> Title: Colossal OSCAR 1 [gom; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2016-40","title":"Colossal OSCAR 1 [gom; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_gom</code> Title: Colossal OSCAR 1 [gom; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2017-43","title":"Colossal OSCAR 1 [gom; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_gom</code> Title: Colossal OSCAR 1 [gom; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2018-47","title":"Colossal OSCAR 1 [gom; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_gom</code> Title: Colossal OSCAR 1 [gom; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2019-22","title":"Colossal OSCAR 1 [gom; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_gom</code> Title: Colossal OSCAR 1 [gom; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2020-24","title":"Colossal OSCAR 1 [gom; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_gom</code> Title: Colossal OSCAR 1 [gom; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2020-45","title":"Colossal OSCAR 1 [gom; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_gom</code> Title: Colossal OSCAR 1 [gom; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2021-49","title":"Colossal OSCAR 1 [gom; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_gom</code> Title: Colossal OSCAR 1 [gom; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2022-27","title":"Colossal OSCAR 1 [gom; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_gom</code> Title: Colossal OSCAR 1 [gom; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2022-49","title":"Colossal OSCAR 1 [gom; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_gom</code> Title: Colossal OSCAR 1 [gom; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2023-14","title":"Colossal OSCAR 1 [gom; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_gom</code> Title: Colossal OSCAR 1 [gom; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gom/#colossal-oscar-1-gom-2023-23","title":"Colossal OSCAR 1 [gom; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_gom</code> Title: Colossal OSCAR 1 [gom; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_gsw/","title":"Gsw Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Gsw language.</p>"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2015-14","title":"Colossal OSCAR 1 [gsw; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_gsw</code> Title: Colossal OSCAR 1 [gsw; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2016-40","title":"Colossal OSCAR 1 [gsw; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_gsw</code> Title: Colossal OSCAR 1 [gsw; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2017-43","title":"Colossal OSCAR 1 [gsw; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_gsw</code> Title: Colossal OSCAR 1 [gsw; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2018-47","title":"Colossal OSCAR 1 [gsw; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_gsw</code> Title: Colossal OSCAR 1 [gsw; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2019-22","title":"Colossal OSCAR 1 [gsw; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_gsw</code> Title: Colossal OSCAR 1 [gsw; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2020-24","title":"Colossal OSCAR 1 [gsw; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_gsw</code> Title: Colossal OSCAR 1 [gsw; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2020-45","title":"Colossal OSCAR 1 [gsw; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_gsw</code> Title: Colossal OSCAR 1 [gsw; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2021-49","title":"Colossal OSCAR 1 [gsw; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_gsw</code> Title: Colossal OSCAR 1 [gsw; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2022-27","title":"Colossal OSCAR 1 [gsw; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_gsw</code> Title: Colossal OSCAR 1 [gsw; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2022-49","title":"Colossal OSCAR 1 [gsw; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_gsw</code> Title: Colossal OSCAR 1 [gsw; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2023-14","title":"Colossal OSCAR 1 [gsw; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_gsw</code> Title: Colossal OSCAR 1 [gsw; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gsw/#colossal-oscar-1-gsw-2023-23","title":"Colossal OSCAR 1 [gsw; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_gsw</code> Title: Colossal OSCAR 1 [gsw; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_gu/","title":"Gujarati Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Gujarati language.</p>"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2015-14","title":"Colossal OSCAR 1 [gu; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_gu</code> Title: Colossal OSCAR 1 [gu; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2016-40","title":"Colossal OSCAR 1 [gu; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_gu</code> Title: Colossal OSCAR 1 [gu; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2017-43","title":"Colossal OSCAR 1 [gu; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_gu</code> Title: Colossal OSCAR 1 [gu; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2018-47","title":"Colossal OSCAR 1 [gu; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_gu</code> Title: Colossal OSCAR 1 [gu; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2019-22","title":"Colossal OSCAR 1 [gu; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_gu</code> Title: Colossal OSCAR 1 [gu; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2020-24","title":"Colossal OSCAR 1 [gu; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_gu</code> Title: Colossal OSCAR 1 [gu; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2020-45","title":"Colossal OSCAR 1 [gu; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_gu</code> Title: Colossal OSCAR 1 [gu; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2021-49","title":"Colossal OSCAR 1 [gu; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_gu</code> Title: Colossal OSCAR 1 [gu; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2022-27","title":"Colossal OSCAR 1 [gu; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_gu</code> Title: Colossal OSCAR 1 [gu; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2022-49","title":"Colossal OSCAR 1 [gu; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_gu</code> Title: Colossal OSCAR 1 [gu; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2023-14","title":"Colossal OSCAR 1 [gu; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_gu</code> Title: Colossal OSCAR 1 [gu; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_gu/#colossal-oscar-1-gu-2023-23","title":"Colossal OSCAR 1 [gu; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_gu</code> Title: Colossal OSCAR 1 [gu; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ha/","title":"Hausa Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Hausa language.</p>"},{"location":"datasets/language_ha/#wura-hausa","title":"WURA [Hausa]","text":"Dataset ID: <code>wura_ha</code> Title: WURA [Hausa] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_he/","title":"Hebrew Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Hebrew language.</p>"},{"location":"datasets/language_he/#colossal-oscar-1-he-2015-14","title":"Colossal OSCAR 1 [he; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_he</code> Title: Colossal OSCAR 1 [he; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2016-40","title":"Colossal OSCAR 1 [he; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_he</code> Title: Colossal OSCAR 1 [he; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2017-43","title":"Colossal OSCAR 1 [he; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_he</code> Title: Colossal OSCAR 1 [he; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2018-47","title":"Colossal OSCAR 1 [he; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_he</code> Title: Colossal OSCAR 1 [he; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2019-22","title":"Colossal OSCAR 1 [he; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_he</code> Title: Colossal OSCAR 1 [he; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2020-24","title":"Colossal OSCAR 1 [he; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_he</code> Title: Colossal OSCAR 1 [he; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2020-45","title":"Colossal OSCAR 1 [he; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_he</code> Title: Colossal OSCAR 1 [he; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2021-49","title":"Colossal OSCAR 1 [he; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_he</code> Title: Colossal OSCAR 1 [he; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2022-27","title":"Colossal OSCAR 1 [he; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_he</code> Title: Colossal OSCAR 1 [he; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2022-49","title":"Colossal OSCAR 1 [he; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_he</code> Title: Colossal OSCAR 1 [he; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2023-14","title":"Colossal OSCAR 1 [he; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_he</code> Title: Colossal OSCAR 1 [he; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_he/#colossal-oscar-1-he-2023-23","title":"Colossal OSCAR 1 [he; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_he</code> Title: Colossal OSCAR 1 [he; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_hi/","title":"Hindi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Hindi language.</p>"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2015-14","title":"Colossal OSCAR 1 [hi; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_hi</code> Title: Colossal OSCAR 1 [hi; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2016-40","title":"Colossal OSCAR 1 [hi; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_hi</code> Title: Colossal OSCAR 1 [hi; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2017-43","title":"Colossal OSCAR 1 [hi; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_hi</code> Title: Colossal OSCAR 1 [hi; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2018-47","title":"Colossal OSCAR 1 [hi; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_hi</code> Title: Colossal OSCAR 1 [hi; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2019-22","title":"Colossal OSCAR 1 [hi; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_hi</code> Title: Colossal OSCAR 1 [hi; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2020-24","title":"Colossal OSCAR 1 [hi; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_hi</code> Title: Colossal OSCAR 1 [hi; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2020-45","title":"Colossal OSCAR 1 [hi; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_hi</code> Title: Colossal OSCAR 1 [hi; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2021-49","title":"Colossal OSCAR 1 [hi; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_hi</code> Title: Colossal OSCAR 1 [hi; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2022-27","title":"Colossal OSCAR 1 [hi; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_hi</code> Title: Colossal OSCAR 1 [hi; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2022-49","title":"Colossal OSCAR 1 [hi; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_hi</code> Title: Colossal OSCAR 1 [hi; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2023-14","title":"Colossal OSCAR 1 [hi; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_hi</code> Title: Colossal OSCAR 1 [hi; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hi/#colossal-oscar-1-hi-2023-23","title":"Colossal OSCAR 1 [hi; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_hi</code> Title: Colossal OSCAR 1 [hi; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_hr/","title":"Croatian Datasets","text":"<p>There are in total 22 datasets with 8 B tokens in Croatian language.</p>"},{"location":"datasets/language_hr/#24sata-news-article-archive-10","title":"24sata news article archive 1.0","text":"Dataset ID: <code>styria_news</code> Title: 24sata news article archive 1.0 Description: The 24sata news portal consists of a portal with daily news and several smaller portals covering news from specific topics, such as automotive news, health, culinary content, and lifestyle advice. The dataset contains over  650,000 articles in Croatian from 2007 to 2019, as well as assigned tags. 
Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1410] License: Creative Commons - Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) (commercial use: False, sharealike: False) Tokens: 409 M"},{"location":"datasets/language_hr/#curlicat-corpus-croatian","title":"CURLICAT Corpus [Croatian]","text":"Dataset ID: <code>curlicat_hr</code> Title: CURLICAT Corpus [Croatian] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-croatian-corpus/00815518592811ed9c1a00155d026706bc4c59740fce4f7986213e7eef133023/] License: unknown (commercial use: None, sharealike: None) Tokens: 49 M"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2015-14","title":"Colossal OSCAR 1 [hr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_hr</code> Title: Colossal OSCAR 1 [hr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2016-40","title":"Colossal OSCAR 1 [hr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_hr</code> Title: Colossal OSCAR 1 [hr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2017-43","title":"Colossal OSCAR 1 [hr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_hr</code> Title: Colossal OSCAR 1 [hr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2018-47","title":"Colossal OSCAR 1 [hr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_hr</code> Title: Colossal OSCAR 1 [hr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2019-22","title":"Colossal OSCAR 1 [hr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_hr</code> Title: Colossal OSCAR 1 [hr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2020-24","title":"Colossal OSCAR 1 [hr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_hr</code> Title: Colossal OSCAR 1 [hr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2020-45","title":"Colossal OSCAR 1 [hr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_hr</code> Title: Colossal OSCAR 1 [hr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2021-49","title":"Colossal OSCAR 1 [hr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_hr</code> Title: Colossal OSCAR 1 [hr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2022-27","title":"Colossal OSCAR 1 [hr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_hr</code> Title: Colossal OSCAR 1 [hr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2022-49","title":"Colossal OSCAR 1 [hr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_hr</code> Title: Colossal OSCAR 1 [hr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2023-14","title":"Colossal OSCAR 1 [hr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_hr</code> Title: Colossal OSCAR 1 [hr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hr/#colossal-oscar-1-hr-2023-23","title":"Colossal OSCAR 1 [hr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_hr</code> Title: Colossal OSCAR 1 [hr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_hr/#corpus-of-croatian-news-portals-engri-2014-2018","title":"Corpus of Croatian news portals ENGRI (2014-2018)","text":"Dataset ID: <code>croatian_news_engri</code> Title: Corpus of Croatian news portals ENGRI (2014-2018) Description: The corpus consists of texts collected from the most popular (based on the Reuters Institute Digital News Report for 2018, retrieved from http://www.digitalnewsreport.org in April, 2019) news portals in Croatia in the period from 2014 to 2018: Direktno, Dnevno, Net Hr, Hrt, Index_Hr, Jutarnji, Novilist, Rtl, SlobodnaDalmacija, Ve\u010dernji, Tportal, Dnevnik. 
Availibility: <code>direct_download</code> Homepage: [https://repository.pfri.uniri.hr/islandora/object/pfri%3A2156] License: Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) (commercial use: False, sharealike: True) Tokens: 695 M"},{"location":"datasets/language_hr/#croatian-web-corpus-hrwac-21","title":"Croatian web corpus hrWaC 2.1","text":"Dataset ID: <code>hrwac</code> Title: Croatian web corpus hrWaC 2.1 Description: hrWaC is a web corpus collected from the .hr top-level domain. The current version of the corpus (v2.0) contains 1.9 billion tokens and is annotated with the lemma, morphosyntax and dependency syntax layers. Availibility: <code>direct_download</code> Homepage: [http://nlp.ffzg.hr/resources/corpora/hrwac/] License: CC-BY-SA license (commercial use: True, sharealike: True) Tokens: 1 B"},{"location":"datasets/language_hr/#eurlexresources-hr","title":"EurlexResources [hr]","text":"Dataset ID: <code>eurlex_hr</code> Title: EurlexResources [hr] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 3 B"},{"location":"datasets/language_hr/#macocu-web-corpus-croatian","title":"MaCoCu web corpus [Croatian]","text":"Dataset ID: <code>macocu_hr</code> Title: MaCoCu web corpus [Croatian] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. 
See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1806] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 2 B"},{"location":"datasets/language_hr/#wikibooks-hr","title":"Wikibooks [hr]","text":"Dataset ID: <code>wikibooks_hr</code> Title: Wikibooks [hr] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 538 k"},{"location":"datasets/language_hr/#wikipedia-hr","title":"Wikipedia [hr]","text":"Dataset ID: <code>wiki_hr</code> Title: Wikipedia [hr] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 65 M"},{"location":"datasets/language_hr/#wikiquote-hr","title":"Wikiquote [hr]","text":"Dataset ID: <code>wikiquote_hr</code> Title: Wikiquote [hr] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 995 k"},{"location":"datasets/language_hr/#wikisource-hr","title":"Wikisource [hr]","text":"Dataset ID: <code>wikisource_hr</code> Title: Wikisource [hr] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 20 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_hsb/","title":"Hsb Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Hsb language.</p>"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2015-14","title":"Colossal OSCAR 1 [hsb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_hsb</code> Title: Colossal OSCAR 1 [hsb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2016-40","title":"Colossal OSCAR 1 [hsb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_hsb</code> Title: Colossal OSCAR 1 [hsb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2017-43","title":"Colossal OSCAR 1 [hsb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_hsb</code> Title: Colossal OSCAR 1 [hsb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2018-47","title":"Colossal OSCAR 1 [hsb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_hsb</code> Title: Colossal OSCAR 1 [hsb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2019-22","title":"Colossal OSCAR 1 [hsb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_hsb</code> Title: Colossal OSCAR 1 [hsb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2020-24","title":"Colossal OSCAR 1 [hsb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_hsb</code> Title: Colossal OSCAR 1 [hsb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2020-45","title":"Colossal OSCAR 1 [hsb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_hsb</code> Title: Colossal OSCAR 1 [hsb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2021-49","title":"Colossal OSCAR 1 [hsb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_hsb</code> Title: Colossal OSCAR 1 [hsb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2022-27","title":"Colossal OSCAR 1 [hsb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_hsb</code> Title: Colossal OSCAR 1 [hsb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2022-49","title":"Colossal OSCAR 1 [hsb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_hsb</code> Title: Colossal OSCAR 1 [hsb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2023-14","title":"Colossal OSCAR 1 [hsb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_hsb</code> Title: Colossal OSCAR 1 [hsb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hsb/#colossal-oscar-1-hsb-2023-23","title":"Colossal OSCAR 1 [hsb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_hsb</code> Title: Colossal OSCAR 1 [hsb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ht/","title":"Haitian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Haitian language.</p>"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2015-14","title":"Colossal OSCAR 1 [ht; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ht</code> Title: Colossal OSCAR 1 [ht; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2016-40","title":"Colossal OSCAR 1 [ht; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ht</code> Title: Colossal OSCAR 1 [ht; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2017-43","title":"Colossal OSCAR 1 [ht; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ht</code> Title: Colossal OSCAR 1 [ht; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2018-47","title":"Colossal OSCAR 1 [ht; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ht</code> Title: Colossal OSCAR 1 [ht; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2019-22","title":"Colossal OSCAR 1 [ht; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ht</code> Title: Colossal OSCAR 1 [ht; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2020-24","title":"Colossal OSCAR 1 [ht; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ht</code> Title: Colossal OSCAR 1 [ht; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2020-45","title":"Colossal OSCAR 1 [ht; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ht</code> Title: Colossal OSCAR 1 [ht; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2021-49","title":"Colossal OSCAR 1 [ht; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ht</code> Title: Colossal OSCAR 1 [ht; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2022-27","title":"Colossal OSCAR 1 [ht; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ht</code> Title: Colossal OSCAR 1 [ht; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2022-49","title":"Colossal OSCAR 1 [ht; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ht</code> Title: Colossal OSCAR 1 [ht; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2023-14","title":"Colossal OSCAR 1 [ht; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ht</code> Title: Colossal OSCAR 1 [ht; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ht/#colossal-oscar-1-ht-2023-23","title":"Colossal OSCAR 1 [ht; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ht</code> Title: Colossal OSCAR 1 [ht; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_hu/","title":"Hungarian Datasets","text":"<p>There are in total 20 datasets with 12 B tokens in Hungarian language.</p>"},{"location":"datasets/language_hu/#curlicat-corpus-hungarian","title":"CURLICAT Corpus [Hungarian]","text":"Dataset ID: <code>curlicat_hu</code> Title: CURLICAT Corpus [Hungarian] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-hungarian-corpus/8b6c8dcb58ea11ed9c1a00155d02670679a453431c8147079e5a7d9b879a9729/] License: CC-BY-SA-4.0 (commercial use: None, sharealike: True) Tokens: 61 M"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2015-14","title":"Colossal OSCAR 1 [hu; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_hu</code> Title: Colossal OSCAR 1 [hu; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2016-40","title":"Colossal OSCAR 1 [hu; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_hu</code> Title: Colossal OSCAR 1 [hu; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2017-43","title":"Colossal OSCAR 1 [hu; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_hu</code> Title: Colossal OSCAR 1 [hu; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2018-47","title":"Colossal OSCAR 1 [hu; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_hu</code> Title: Colossal OSCAR 1 [hu; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2019-22","title":"Colossal OSCAR 1 [hu; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_hu</code> Title: Colossal OSCAR 1 [hu; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2020-24","title":"Colossal OSCAR 1 [hu; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_hu</code> Title: Colossal OSCAR 1 [hu; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2020-45","title":"Colossal OSCAR 1 [hu; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_hu</code> Title: Colossal OSCAR 1 [hu; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2021-49","title":"Colossal OSCAR 1 [hu; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_hu</code> Title: Colossal OSCAR 1 [hu; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2022-27","title":"Colossal OSCAR 1 [hu; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_hu</code> Title: Colossal OSCAR 1 [hu; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2022-49","title":"Colossal OSCAR 1 [hu; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_hu</code> Title: Colossal OSCAR 1 [hu; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2023-14","title":"Colossal OSCAR 1 [hu; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_hu</code> Title: Colossal OSCAR 1 [hu; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hu/#colossal-oscar-1-hu-2023-23","title":"Colossal OSCAR 1 [hu; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_hu</code> Title: Colossal OSCAR 1 [hu; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 7 B"},{"location":"datasets/language_hu/#eurlexresources-hu","title":"EurlexResources [hu]","text":"Dataset ID: <code>eurlex_hu</code> Title: EurlexResources [hu] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_hu/#legalmc4-hu","title":"LegalMC4 [hu]","text":"Dataset ID: <code>legal_mc4_hu</code> Title: LegalMC4 [hu] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. 
By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 245 M"},{"location":"datasets/language_hu/#wikibooks-hu","title":"Wikibooks [hu]","text":"Dataset ID: <code>wikibooks_hu</code> Title: Wikibooks [hu] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 19 M"},{"location":"datasets/language_hu/#wikinews-hu","title":"Wikinews [hu]","text":"Dataset ID: <code>wikinews_hu</code> Title: Wikinews [hu] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 427 k"},{"location":"datasets/language_hu/#wikipedia-hu","title":"Wikipedia [hu]","text":"Dataset ID: <code>wiki_hu</code> Title: Wikipedia [hu] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 308 M"},{"location":"datasets/language_hu/#wikiquote-hu","title":"Wikiquote [hu]","text":"Dataset ID: <code>wikiquote_hu</code> Title: Wikiquote [hu] Description: The free quote compendium that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M"},{"location":"datasets/language_hu/#wikisource-hu","title":"Wikisource [hu]","text":"Dataset ID: <code>wikisource_hu</code> Title: Wikisource [hu] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 36 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_hy/","title":"Armenian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Armenian language.</p>"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2015-14","title":"Colossal OSCAR 1 [hy; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_hy</code> Title: Colossal OSCAR 1 [hy; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2016-40","title":"Colossal OSCAR 1 [hy; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_hy</code> Title: Colossal OSCAR 1 [hy; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2017-43","title":"Colossal OSCAR 1 [hy; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_hy</code> Title: Colossal OSCAR 1 [hy; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2018-47","title":"Colossal OSCAR 1 [hy; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_hy</code> Title: Colossal OSCAR 1 [hy; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2019-22","title":"Colossal OSCAR 1 [hy; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_hy</code> Title: Colossal OSCAR 1 [hy; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2020-24","title":"Colossal OSCAR 1 [hy; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_hy</code> Title: Colossal OSCAR 1 [hy; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2020-45","title":"Colossal OSCAR 1 [hy; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_hy</code> Title: Colossal OSCAR 1 [hy; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2021-49","title":"Colossal OSCAR 1 [hy; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_hy</code> Title: Colossal OSCAR 1 [hy; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2022-27","title":"Colossal OSCAR 1 [hy; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_hy</code> Title: Colossal OSCAR 1 [hy; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2022-49","title":"Colossal OSCAR 1 [hy; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_hy</code> Title: Colossal OSCAR 1 [hy; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2023-14","title":"Colossal OSCAR 1 [hy; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_hy</code> Title: Colossal OSCAR 1 [hy; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_hy/#colossal-oscar-1-hy-2023-23","title":"Colossal OSCAR 1 [hy; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_hy</code> Title: Colossal OSCAR 1 [hy; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ia/","title":"Interlingua Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Interlingua language.</p>"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2015-14","title":"Colossal OSCAR 1 [ia; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ia</code> Title: Colossal OSCAR 1 [ia; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2016-40","title":"Colossal OSCAR 1 [ia; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ia</code> Title: Colossal OSCAR 1 [ia; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2017-43","title":"Colossal OSCAR 1 [ia; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ia</code> Title: Colossal OSCAR 1 [ia; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2018-47","title":"Colossal OSCAR 1 [ia; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ia</code> Title: Colossal OSCAR 1 [ia; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2019-22","title":"Colossal OSCAR 1 [ia; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ia</code> Title: Colossal OSCAR 1 [ia; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2020-24","title":"Colossal OSCAR 1 [ia; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ia</code> Title: Colossal OSCAR 1 [ia; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2020-45","title":"Colossal OSCAR 1 [ia; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ia</code> Title: Colossal OSCAR 1 [ia; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2021-49","title":"Colossal OSCAR 1 [ia; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ia</code> Title: Colossal OSCAR 1 [ia; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2022-27","title":"Colossal OSCAR 1 [ia; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ia</code> Title: Colossal OSCAR 1 [ia; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2022-49","title":"Colossal OSCAR 1 [ia; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ia</code> Title: Colossal OSCAR 1 [ia; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2023-14","title":"Colossal OSCAR 1 [ia; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ia</code> Title: Colossal OSCAR 1 [ia; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ia/#colossal-oscar-1-ia-2023-23","title":"Colossal OSCAR 1 [ia; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ia</code> Title: Colossal OSCAR 1 [ia; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_id/","title":"Indonesian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Indonesian language.</p>"},{"location":"datasets/language_id/#colossal-oscar-1-id-2015-14","title":"Colossal OSCAR 1 [id; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_id</code> Title: Colossal OSCAR 1 [id; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2016-40","title":"Colossal OSCAR 1 [id; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_id</code> Title: Colossal OSCAR 1 [id; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2017-43","title":"Colossal OSCAR 1 [id; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_id</code> Title: Colossal OSCAR 1 [id; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2018-47","title":"Colossal OSCAR 1 [id; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_id</code> Title: Colossal OSCAR 1 [id; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2019-22","title":"Colossal OSCAR 1 [id; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_id</code> Title: Colossal OSCAR 1 [id; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2020-24","title":"Colossal OSCAR 1 [id; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_id</code> Title: Colossal OSCAR 1 [id; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2020-45","title":"Colossal OSCAR 1 [id; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_id</code> Title: Colossal OSCAR 1 [id; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2021-49","title":"Colossal OSCAR 1 [id; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_id</code> Title: Colossal OSCAR 1 [id; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2022-27","title":"Colossal OSCAR 1 [id; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_id</code> Title: Colossal OSCAR 1 [id; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2022-49","title":"Colossal OSCAR 1 [id; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_id</code> Title: Colossal OSCAR 1 [id; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2023-14","title":"Colossal OSCAR 1 [id; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_id</code> Title: Colossal OSCAR 1 [id; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_id/#colossal-oscar-1-id-2023-23","title":"Colossal OSCAR 1 [id; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_id</code> Title: Colossal OSCAR 1 [id; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ie/","title":"Interlingue Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Interlingue language.</p>"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2015-14","title":"Colossal OSCAR 1 [ie; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ie</code> Title: Colossal OSCAR 1 [ie; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2016-40","title":"Colossal OSCAR 1 [ie; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ie</code> Title: Colossal OSCAR 1 [ie; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2017-43","title":"Colossal OSCAR 1 [ie; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ie</code> Title: Colossal OSCAR 1 [ie; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2018-47","title":"Colossal OSCAR 1 [ie; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ie</code> Title: Colossal OSCAR 1 [ie; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2019-22","title":"Colossal OSCAR 1 [ie; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ie</code> Title: Colossal OSCAR 1 [ie; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2020-24","title":"Colossal OSCAR 1 [ie; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ie</code> Title: Colossal OSCAR 1 [ie; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2020-45","title":"Colossal OSCAR 1 [ie; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ie</code> Title: Colossal OSCAR 1 [ie; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2021-49","title":"Colossal OSCAR 1 [ie; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ie</code> Title: Colossal OSCAR 1 [ie; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2022-27","title":"Colossal OSCAR 1 [ie; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ie</code> Title: Colossal OSCAR 1 [ie; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2022-49","title":"Colossal OSCAR 1 [ie; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ie</code> Title: Colossal OSCAR 1 [ie; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2023-14","title":"Colossal OSCAR 1 [ie; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ie</code> Title: Colossal OSCAR 1 [ie; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ie/#colossal-oscar-1-ie-2023-23","title":"Colossal OSCAR 1 [ie; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ie</code> Title: Colossal OSCAR 1 [ie; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ig/","title":"Igbo Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Igbo language.</p>"},{"location":"datasets/language_ig/#wura-igbo","title":"WURA [Igbo]","text":"Dataset ID: <code>wura_ig</code> Title: WURA [Igbo] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ilo/","title":"Ilo Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Ilo language.</p>"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2015-14","title":"Colossal OSCAR 1 [ilo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ilo</code> Title: Colossal OSCAR 1 [ilo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2016-40","title":"Colossal OSCAR 1 [ilo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ilo</code> Title: Colossal OSCAR 1 [ilo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2017-43","title":"Colossal OSCAR 1 [ilo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ilo</code> Title: Colossal OSCAR 1 [ilo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2018-47","title":"Colossal OSCAR 1 [ilo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ilo</code> Title: Colossal OSCAR 1 [ilo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2019-22","title":"Colossal OSCAR 1 [ilo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ilo</code> Title: Colossal OSCAR 1 [ilo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2020-24","title":"Colossal OSCAR 1 [ilo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ilo</code> Title: Colossal OSCAR 1 [ilo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2020-45","title":"Colossal OSCAR 1 [ilo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ilo</code> Title: Colossal OSCAR 1 [ilo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2021-49","title":"Colossal OSCAR 1 [ilo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ilo</code> Title: Colossal OSCAR 1 [ilo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2022-27","title":"Colossal OSCAR 1 [ilo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ilo</code> Title: Colossal OSCAR 1 [ilo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2022-49","title":"Colossal OSCAR 1 [ilo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ilo</code> Title: Colossal OSCAR 1 [ilo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2023-14","title":"Colossal OSCAR 1 [ilo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ilo</code> Title: Colossal OSCAR 1 [ilo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ilo/#colossal-oscar-1-ilo-2023-23","title":"Colossal OSCAR 1 [ilo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ilo</code> Title: Colossal OSCAR 1 [ilo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_io/","title":"Ido Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Ido language.</p>"},{"location":"datasets/language_io/#colossal-oscar-1-io-2015-14","title":"Colossal OSCAR 1 [io; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_io</code> Title: Colossal OSCAR 1 [io; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2016-40","title":"Colossal OSCAR 1 [io; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_io</code> Title: Colossal OSCAR 1 [io; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2017-43","title":"Colossal OSCAR 1 [io; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_io</code> Title: Colossal OSCAR 1 [io; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2018-47","title":"Colossal OSCAR 1 [io; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_io</code> Title: Colossal OSCAR 1 [io; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2019-22","title":"Colossal OSCAR 1 [io; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_io</code> Title: Colossal OSCAR 1 [io; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2020-24","title":"Colossal OSCAR 1 [io; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_io</code> Title: Colossal OSCAR 1 [io; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2020-45","title":"Colossal OSCAR 1 [io; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_io</code> Title: Colossal OSCAR 1 [io; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2021-49","title":"Colossal OSCAR 1 [io; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_io</code> Title: Colossal OSCAR 1 [io; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2022-27","title":"Colossal OSCAR 1 [io; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_io</code> Title: Colossal OSCAR 1 [io; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2022-49","title":"Colossal OSCAR 1 [io; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_io</code> Title: Colossal OSCAR 1 [io; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2023-14","title":"Colossal OSCAR 1 [io; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_io</code> Title: Colossal OSCAR 1 [io; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_io/#colossal-oscar-1-io-2023-23","title":"Colossal OSCAR 1 [io; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_io</code> Title: Colossal OSCAR 1 [io; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_is/","title":"Icelandic Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Icelandic language.</p>"},{"location":"datasets/language_is/#colossal-oscar-1-is-2015-14","title":"Colossal OSCAR 1 [is; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_is</code> Title: Colossal OSCAR 1 [is; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2016-40","title":"Colossal OSCAR 1 [is; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_is</code> Title: Colossal OSCAR 1 [is; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2017-43","title":"Colossal OSCAR 1 [is; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_is</code> Title: Colossal OSCAR 1 [is; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2018-47","title":"Colossal OSCAR 1 [is; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_is</code> Title: Colossal OSCAR 1 [is; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2019-22","title":"Colossal OSCAR 1 [is; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_is</code> Title: Colossal OSCAR 1 [is; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2020-24","title":"Colossal OSCAR 1 [is; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_is</code> Title: Colossal OSCAR 1 [is; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2020-45","title":"Colossal OSCAR 1 [is; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_is</code> Title: Colossal OSCAR 1 [is; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2021-49","title":"Colossal OSCAR 1 [is; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_is</code> Title: Colossal OSCAR 1 [is; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2022-27","title":"Colossal OSCAR 1 [is; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_is</code> Title: Colossal OSCAR 1 [is; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2022-49","title":"Colossal OSCAR 1 [is; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_is</code> Title: Colossal OSCAR 1 [is; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2023-14","title":"Colossal OSCAR 1 [is; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_is</code> Title: Colossal OSCAR 1 [is; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_is/#colossal-oscar-1-is-2023-23","title":"Colossal OSCAR 1 [is; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_is</code> Title: Colossal OSCAR 1 [is; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_it/","title":"Italian Datasets","text":"<p>There are in total 21 datasets with 14 B tokens in Italian language.</p>"},{"location":"datasets/language_it/#colossal-oscar-1-it-2015-14","title":"Colossal OSCAR 1 [it; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_it</code> Title: Colossal OSCAR 1 [it; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2016-40","title":"Colossal OSCAR 1 [it; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_it</code> Title: Colossal OSCAR 1 [it; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2017-43","title":"Colossal OSCAR 1 [it; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_it</code> Title: Colossal OSCAR 1 [it; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2018-47","title":"Colossal OSCAR 1 [it; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_it</code> Title: Colossal OSCAR 1 [it; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2019-22","title":"Colossal OSCAR 1 [it; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_it</code> Title: Colossal OSCAR 1 [it; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2020-24","title":"Colossal OSCAR 1 [it; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_it</code> Title: Colossal OSCAR 1 [it; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2020-45","title":"Colossal OSCAR 1 [it; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_it</code> Title: Colossal OSCAR 1 [it; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2021-49","title":"Colossal OSCAR 1 [it; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_it</code> Title: Colossal OSCAR 1 [it; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2022-27","title":"Colossal OSCAR 1 [it; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_it</code> Title: Colossal OSCAR 1 [it; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2022-49","title":"Colossal OSCAR 1 [it; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_it</code> Title: Colossal OSCAR 1 [it; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2023-14","title":"Colossal OSCAR 1 [it; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_it</code> Title: Colossal OSCAR 1 [it; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#colossal-oscar-1-it-2023-23","title":"Colossal OSCAR 1 [it; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_it</code> Title: Colossal OSCAR 1 [it; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_it/#eurlexresources-it","title":"EurlexResources [it]","text":"Dataset ID: <code>eurlex_it</code> Title: EurlexResources [it] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 8 B"},{"location":"datasets/language_it/#itwac","title":"ITWaC","text":"Dataset ID: <code>itwac</code> Title: ITWaC Description: itWaC: a 2 billion word corpus constructed from the Web limiting the crawl to the .it domain and using medium-frequency words from the Repubblica corpus and basic Italian vocabulary lists as seeds. Availibility: <code>on_request</code> Homepage: [https://docs.sslmit.unibo.it/doku.php?id=corpora:itwac] License: unknown, likely research-only or fair use (commercial use: None, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_it/#legalmc4-it","title":"LegalMC4 [it]","text":"Dataset ID: <code>legal_mc4_it</code> Title: LegalMC4 [it] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. 
(commercial use: True, sharealike: None) Tokens: 3 B"},{"location":"datasets/language_it/#wikibooks-it","title":"Wikibooks [it]","text":"Dataset ID: <code>wikibooks_it</code> Title: Wikibooks [it] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 29 M"},{"location":"datasets/language_it/#wikinews-it","title":"Wikinews [it]","text":"Dataset ID: <code>wikinews_it</code> Title: Wikinews [it] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 5 M"},{"location":"datasets/language_it/#wikipedia-it","title":"Wikipedia [it]","text":"Dataset ID: <code>wiki_it</code> Title: Wikipedia [it] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 821 M"},{"location":"datasets/language_it/#wikiquote-it","title":"Wikiquote [it]","text":"Dataset ID: <code>wikiquote_it</code> Title: Wikiquote [it] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 55 M"},{"location":"datasets/language_it/#wikisource-it","title":"Wikisource [it]","text":"Dataset ID: <code>wikisource_it</code> Title: Wikisource [it] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 66 M"},{"location":"datasets/language_it/#wikivoyage-it","title":"Wikivoyage [it]","text":"Dataset ID: <code>wikivoyage_it</code> Title: Wikivoyage [it] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 13 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_ja/","title":"Japanese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Japanese language.</p>"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2015-14","title":"Colossal OSCAR 1 [ja; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ja</code> Title: Colossal OSCAR 1 [ja; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2016-40","title":"Colossal OSCAR 1 [ja; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ja</code> Title: Colossal OSCAR 1 [ja; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2017-43","title":"Colossal OSCAR 1 [ja; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ja</code> Title: Colossal OSCAR 1 [ja; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2018-47","title":"Colossal OSCAR 1 [ja; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ja</code> Title: Colossal OSCAR 1 [ja; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2019-22","title":"Colossal OSCAR 1 [ja; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ja</code> Title: Colossal OSCAR 1 [ja; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2020-24","title":"Colossal OSCAR 1 [ja; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ja</code> Title: Colossal OSCAR 1 [ja; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2020-45","title":"Colossal OSCAR 1 [ja; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ja</code> Title: Colossal OSCAR 1 [ja; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2021-49","title":"Colossal OSCAR 1 [ja; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ja</code> Title: Colossal OSCAR 1 [ja; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2022-27","title":"Colossal OSCAR 1 [ja; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ja</code> Title: Colossal OSCAR 1 [ja; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2022-49","title":"Colossal OSCAR 1 [ja; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ja</code> Title: Colossal OSCAR 1 [ja; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2023-14","title":"Colossal OSCAR 1 [ja; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ja</code> Title: Colossal OSCAR 1 [ja; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ja/#colossal-oscar-1-ja-2023-23","title":"Colossal OSCAR 1 [ja; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ja</code> Title: Colossal OSCAR 1 [ja; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_jbo/","title":"Jbo Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Jbo language.</p>"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2015-14","title":"Colossal OSCAR 1 [jbo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_jbo</code> Title: Colossal OSCAR 1 [jbo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2016-40","title":"Colossal OSCAR 1 [jbo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_jbo</code> Title: Colossal OSCAR 1 [jbo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2017-43","title":"Colossal OSCAR 1 [jbo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_jbo</code> Title: Colossal OSCAR 1 [jbo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2018-47","title":"Colossal OSCAR 1 [jbo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_jbo</code> Title: Colossal OSCAR 1 [jbo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2019-22","title":"Colossal OSCAR 1 [jbo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_jbo</code> Title: Colossal OSCAR 1 [jbo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2020-24","title":"Colossal OSCAR 1 [jbo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_jbo</code> Title: Colossal OSCAR 1 [jbo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2020-45","title":"Colossal OSCAR 1 [jbo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_jbo</code> Title: Colossal OSCAR 1 [jbo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2021-49","title":"Colossal OSCAR 1 [jbo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_jbo</code> Title: Colossal OSCAR 1 [jbo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2022-27","title":"Colossal OSCAR 1 [jbo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_jbo</code> Title: Colossal OSCAR 1 [jbo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2022-49","title":"Colossal OSCAR 1 [jbo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_jbo</code> Title: Colossal OSCAR 1 [jbo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2023-14","title":"Colossal OSCAR 1 [jbo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_jbo</code> Title: Colossal OSCAR 1 [jbo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jbo/#colossal-oscar-1-jbo-2023-23","title":"Colossal OSCAR 1 [jbo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_jbo</code> Title: Colossal OSCAR 1 [jbo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_jv/","title":"Javanese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Javanese language.</p>"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2015-14","title":"Colossal OSCAR 1 [jv; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_jv</code> Title: Colossal OSCAR 1 [jv; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2016-40","title":"Colossal OSCAR 1 [jv; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_jv</code> Title: Colossal OSCAR 1 [jv; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2017-43","title":"Colossal OSCAR 1 [jv; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_jv</code> Title: Colossal OSCAR 1 [jv; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2018-47","title":"Colossal OSCAR 1 [jv; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_jv</code> Title: Colossal OSCAR 1 [jv; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2019-22","title":"Colossal OSCAR 1 [jv; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_jv</code> Title: Colossal OSCAR 1 [jv; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2020-24","title":"Colossal OSCAR 1 [jv; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_jv</code> Title: Colossal OSCAR 1 [jv; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2020-45","title":"Colossal OSCAR 1 [jv; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_jv</code> Title: Colossal OSCAR 1 [jv; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2021-49","title":"Colossal OSCAR 1 [jv; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_jv</code> Title: Colossal OSCAR 1 [jv; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2022-27","title":"Colossal OSCAR 1 [jv; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_jv</code> Title: Colossal OSCAR 1 [jv; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2022-49","title":"Colossal OSCAR 1 [jv; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_jv</code> Title: Colossal OSCAR 1 [jv; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2023-14","title":"Colossal OSCAR 1 [jv; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_jv</code> Title: Colossal OSCAR 1 [jv; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_jv/#colossal-oscar-1-jv-2023-23","title":"Colossal OSCAR 1 [jv; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_jv</code> Title: Colossal OSCAR 1 [jv; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ka/","title":"Georgian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Georgian language.</p>"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2015-14","title":"Colossal OSCAR 1 [ka; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ka</code> Title: Colossal OSCAR 1 [ka; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2016-40","title":"Colossal OSCAR 1 [ka; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ka</code> Title: Colossal OSCAR 1 [ka; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2017-43","title":"Colossal OSCAR 1 [ka; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ka</code> Title: Colossal OSCAR 1 [ka; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2018-47","title":"Colossal OSCAR 1 [ka; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ka</code> Title: Colossal OSCAR 1 [ka; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2019-22","title":"Colossal OSCAR 1 [ka; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ka</code> Title: Colossal OSCAR 1 [ka; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2020-24","title":"Colossal OSCAR 1 [ka; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ka</code> Title: Colossal OSCAR 1 [ka; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2020-45","title":"Colossal OSCAR 1 [ka; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ka</code> Title: Colossal OSCAR 1 [ka; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2021-49","title":"Colossal OSCAR 1 [ka; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ka</code> Title: Colossal OSCAR 1 [ka; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2022-27","title":"Colossal OSCAR 1 [ka; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ka</code> Title: Colossal OSCAR 1 [ka; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2022-49","title":"Colossal OSCAR 1 [ka; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ka</code> Title: Colossal OSCAR 1 [ka; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2023-14","title":"Colossal OSCAR 1 [ka; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ka</code> Title: Colossal OSCAR 1 [ka; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ka/#colossal-oscar-1-ka-2023-23","title":"Colossal OSCAR 1 [ka; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ka</code> Title: Colossal OSCAR 1 [ka; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_kk/","title":"Kazakh Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Kazakh language.</p>"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2015-14","title":"Colossal OSCAR 1 [kk; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_kk</code> Title: Colossal OSCAR 1 [kk; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2016-40","title":"Colossal OSCAR 1 [kk; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_kk</code> Title: Colossal OSCAR 1 [kk; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2017-43","title":"Colossal OSCAR 1 [kk; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_kk</code> Title: Colossal OSCAR 1 [kk; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2018-47","title":"Colossal OSCAR 1 [kk; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_kk</code> Title: Colossal OSCAR 1 [kk; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2019-22","title":"Colossal OSCAR 1 [kk; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_kk</code> Title: Colossal OSCAR 1 [kk; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2020-24","title":"Colossal OSCAR 1 [kk; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_kk</code> Title: Colossal OSCAR 1 [kk; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2020-45","title":"Colossal OSCAR 1 [kk; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_kk</code> Title: Colossal OSCAR 1 [kk; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2021-49","title":"Colossal OSCAR 1 [kk; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_kk</code> Title: Colossal OSCAR 1 [kk; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2022-27","title":"Colossal OSCAR 1 [kk; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_kk</code> Title: Colossal OSCAR 1 [kk; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2022-49","title":"Colossal OSCAR 1 [kk; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_kk</code> Title: Colossal OSCAR 1 [kk; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2023-14","title":"Colossal OSCAR 1 [kk; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_kk</code> Title: Colossal OSCAR 1 [kk; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kk/#colossal-oscar-1-kk-2023-23","title":"Colossal OSCAR 1 [kk; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_kk</code> Title: Colossal OSCAR 1 [kk; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_km/","title":"Khmer Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Khmer language.</p>"},{"location":"datasets/language_km/#colossal-oscar-1-km-2015-14","title":"Colossal OSCAR 1 [km; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_km</code> Title: Colossal OSCAR 1 [km; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2016-40","title":"Colossal OSCAR 1 [km; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_km</code> Title: Colossal OSCAR 1 [km; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2017-43","title":"Colossal OSCAR 1 [km; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_km</code> Title: Colossal OSCAR 1 [km; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2018-47","title":"Colossal OSCAR 1 [km; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_km</code> Title: Colossal OSCAR 1 [km; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2019-22","title":"Colossal OSCAR 1 [km; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_km</code> Title: Colossal OSCAR 1 [km; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2020-24","title":"Colossal OSCAR 1 [km; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_km</code> Title: Colossal OSCAR 1 [km; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2020-45","title":"Colossal OSCAR 1 [km; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_km</code> Title: Colossal OSCAR 1 [km; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2021-49","title":"Colossal OSCAR 1 [km; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_km</code> Title: Colossal OSCAR 1 [km; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2022-27","title":"Colossal OSCAR 1 [km; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_km</code> Title: Colossal OSCAR 1 [km; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2022-49","title":"Colossal OSCAR 1 [km; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_km</code> Title: Colossal OSCAR 1 [km; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2023-14","title":"Colossal OSCAR 1 [km; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_km</code> Title: Colossal OSCAR 1 [km; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_km/#colossal-oscar-1-km-2023-23","title":"Colossal OSCAR 1 [km; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_km</code> Title: Colossal OSCAR 1 [km; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_kn/","title":"Kannada Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Kannada language.</p>"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2015-14","title":"Colossal OSCAR 1 [kn; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_kn</code> Title: Colossal OSCAR 1 [kn; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2016-40","title":"Colossal OSCAR 1 [kn; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_kn</code> Title: Colossal OSCAR 1 [kn; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2017-43","title":"Colossal OSCAR 1 [kn; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_kn</code> Title: Colossal OSCAR 1 [kn; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2018-47","title":"Colossal OSCAR 1 [kn; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_kn</code> Title: Colossal OSCAR 1 [kn; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2019-22","title":"Colossal OSCAR 1 [kn; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_kn</code> Title: Colossal OSCAR 1 [kn; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2020-24","title":"Colossal OSCAR 1 [kn; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_kn</code> Title: Colossal OSCAR 1 [kn; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2020-45","title":"Colossal OSCAR 1 [kn; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_kn</code> Title: Colossal OSCAR 1 [kn; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2021-49","title":"Colossal OSCAR 1 [kn; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_kn</code> Title: Colossal OSCAR 1 [kn; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2022-27","title":"Colossal OSCAR 1 [kn; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_kn</code> Title: Colossal OSCAR 1 [kn; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2022-49","title":"Colossal OSCAR 1 [kn; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_kn</code> Title: Colossal OSCAR 1 [kn; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2023-14","title":"Colossal OSCAR 1 [kn; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_kn</code> Title: Colossal OSCAR 1 [kn; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kn/#colossal-oscar-1-kn-2023-23","title":"Colossal OSCAR 1 [kn; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_kn</code> Title: Colossal OSCAR 1 [kn; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ko/","title":"Korean Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Korean language.</p>"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2015-14","title":"Colossal OSCAR 1 [ko; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ko</code> Title: Colossal OSCAR 1 [ko; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2016-40","title":"Colossal OSCAR 1 [ko; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ko</code> Title: Colossal OSCAR 1 [ko; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2017-43","title":"Colossal OSCAR 1 [ko; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ko</code> Title: Colossal OSCAR 1 [ko; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2018-47","title":"Colossal OSCAR 1 [ko; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ko</code> Title: Colossal OSCAR 1 [ko; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2019-22","title":"Colossal OSCAR 1 [ko; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ko</code> Title: Colossal OSCAR 1 [ko; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2020-24","title":"Colossal OSCAR 1 [ko; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ko</code> Title: Colossal OSCAR 1 [ko; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2020-45","title":"Colossal OSCAR 1 [ko; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ko</code> Title: Colossal OSCAR 1 [ko; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2021-49","title":"Colossal OSCAR 1 [ko; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ko</code> Title: Colossal OSCAR 1 [ko; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2022-27","title":"Colossal OSCAR 1 [ko; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ko</code> Title: Colossal OSCAR 1 [ko; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2022-49","title":"Colossal OSCAR 1 [ko; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ko</code> Title: Colossal OSCAR 1 [ko; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2023-14","title":"Colossal OSCAR 1 [ko; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ko</code> Title: Colossal OSCAR 1 [ko; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ko/#colossal-oscar-1-ko-2023-23","title":"Colossal OSCAR 1 [ko; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ko</code> Title: Colossal OSCAR 1 [ko; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_krc/","title":"Krc Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Krc language.</p>"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2015-14","title":"Colossal OSCAR 1 [krc; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_krc</code> Title: Colossal OSCAR 1 [krc; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2016-40","title":"Colossal OSCAR 1 [krc; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_krc</code> Title: Colossal OSCAR 1 [krc; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2017-43","title":"Colossal OSCAR 1 [krc; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_krc</code> Title: Colossal OSCAR 1 [krc; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2018-47","title":"Colossal OSCAR 1 [krc; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_krc</code> Title: Colossal OSCAR 1 [krc; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2019-22","title":"Colossal OSCAR 1 [krc; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_krc</code> Title: Colossal OSCAR 1 [krc; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2020-24","title":"Colossal OSCAR 1 [krc; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_krc</code> Title: Colossal OSCAR 1 [krc; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2020-45","title":"Colossal OSCAR 1 [krc; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_krc</code> Title: Colossal OSCAR 1 [krc; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2021-49","title":"Colossal OSCAR 1 [krc; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_krc</code> Title: Colossal OSCAR 1 [krc; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2022-27","title":"Colossal OSCAR 1 [krc; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_krc</code> Title: Colossal OSCAR 1 [krc; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2022-49","title":"Colossal OSCAR 1 [krc; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_krc</code> Title: Colossal OSCAR 1 [krc; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2023-14","title":"Colossal OSCAR 1 [krc; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_krc</code> Title: Colossal OSCAR 1 [krc; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_krc/#colossal-oscar-1-krc-2023-23","title":"Colossal OSCAR 1 [krc; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_krc</code> Title: Colossal OSCAR 1 [krc; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ku/","title":"Kurdish Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Kurdish language.</p>"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2015-14","title":"Colossal OSCAR 1 [ku; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ku</code> Title: Colossal OSCAR 1 [ku; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2016-40","title":"Colossal OSCAR 1 [ku; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ku</code> Title: Colossal OSCAR 1 [ku; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2017-43","title":"Colossal OSCAR 1 [ku; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ku</code> Title: Colossal OSCAR 1 [ku; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2018-47","title":"Colossal OSCAR 1 [ku; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ku</code> Title: Colossal OSCAR 1 [ku; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2019-22","title":"Colossal OSCAR 1 [ku; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ku</code> Title: Colossal OSCAR 1 [ku; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2020-24","title":"Colossal OSCAR 1 [ku; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ku</code> Title: Colossal OSCAR 1 [ku; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2020-45","title":"Colossal OSCAR 1 [ku; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ku</code> Title: Colossal OSCAR 1 [ku; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2021-49","title":"Colossal OSCAR 1 [ku; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ku</code> Title: Colossal OSCAR 1 [ku; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2022-27","title":"Colossal OSCAR 1 [ku; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ku</code> Title: Colossal OSCAR 1 [ku; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2022-49","title":"Colossal OSCAR 1 [ku; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ku</code> Title: Colossal OSCAR 1 [ku; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2023-14","title":"Colossal OSCAR 1 [ku; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ku</code> Title: Colossal OSCAR 1 [ku; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ku/#colossal-oscar-1-ku-2023-23","title":"Colossal OSCAR 1 [ku; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ku</code> Title: Colossal OSCAR 1 [ku; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_kv/","title":"Komi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Komi language.</p>"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2015-14","title":"Colossal OSCAR 1 [kv; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_kv</code> Title: Colossal OSCAR 1 [kv; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2016-40","title":"Colossal OSCAR 1 [kv; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_kv</code> Title: Colossal OSCAR 1 [kv; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2017-43","title":"Colossal OSCAR 1 [kv; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_kv</code> Title: Colossal OSCAR 1 [kv; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2018-47","title":"Colossal OSCAR 1 [kv; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_kv</code> Title: Colossal OSCAR 1 [kv; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2019-22","title":"Colossal OSCAR 1 [kv; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_kv</code> Title: Colossal OSCAR 1 [kv; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2020-24","title":"Colossal OSCAR 1 [kv; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_kv</code> Title: Colossal OSCAR 1 [kv; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2020-45","title":"Colossal OSCAR 1 [kv; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_kv</code> Title: Colossal OSCAR 1 [kv; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2021-49","title":"Colossal OSCAR 1 [kv; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_kv</code> Title: Colossal OSCAR 1 [kv; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2022-27","title":"Colossal OSCAR 1 [kv; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_kv</code> Title: Colossal OSCAR 1 [kv; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2022-49","title":"Colossal OSCAR 1 [kv; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_kv</code> Title: Colossal OSCAR 1 [kv; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2023-14","title":"Colossal OSCAR 1 [kv; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_kv</code> Title: Colossal OSCAR 1 [kv; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kv/#colossal-oscar-1-kv-2023-23","title":"Colossal OSCAR 1 [kv; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_kv</code> Title: Colossal OSCAR 1 [kv; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_kw/","title":"Cornish Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Cornish language.</p>"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2015-14","title":"Colossal OSCAR 1 [kw; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_kw</code> Title: Colossal OSCAR 1 [kw; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2016-40","title":"Colossal OSCAR 1 [kw; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_kw</code> Title: Colossal OSCAR 1 [kw; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2017-43","title":"Colossal OSCAR 1 [kw; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_kw</code> Title: Colossal OSCAR 1 [kw; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2018-47","title":"Colossal OSCAR 1 [kw; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_kw</code> Title: Colossal OSCAR 1 [kw; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2019-22","title":"Colossal OSCAR 1 [kw; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_kw</code> Title: Colossal OSCAR 1 [kw; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2020-24","title":"Colossal OSCAR 1 [kw; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_kw</code> Title: Colossal OSCAR 1 [kw; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2020-45","title":"Colossal OSCAR 1 [kw; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_kw</code> Title: Colossal OSCAR 1 [kw; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2021-49","title":"Colossal OSCAR 1 [kw; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_kw</code> Title: Colossal OSCAR 1 [kw; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2022-27","title":"Colossal OSCAR 1 [kw; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_kw</code> Title: Colossal OSCAR 1 [kw; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2022-49","title":"Colossal OSCAR 1 [kw; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_kw</code> Title: Colossal OSCAR 1 [kw; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2023-14","title":"Colossal OSCAR 1 [kw; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_kw</code> Title: Colossal OSCAR 1 [kw; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_kw/#colossal-oscar-1-kw-2023-23","title":"Colossal OSCAR 1 [kw; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_kw</code> Title: Colossal OSCAR 1 [kw; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ky/","title":"Kirghiz Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Kirghiz language.</p>"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2015-14","title":"Colossal OSCAR 1 [ky; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ky</code> Title: Colossal OSCAR 1 [ky; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2016-40","title":"Colossal OSCAR 1 [ky; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ky</code> Title: Colossal OSCAR 1 [ky; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2017-43","title":"Colossal OSCAR 1 [ky; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ky</code> Title: Colossal OSCAR 1 [ky; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2018-47","title":"Colossal OSCAR 1 [ky; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ky</code> Title: Colossal OSCAR 1 [ky; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2019-22","title":"Colossal OSCAR 1 [ky; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ky</code> Title: Colossal OSCAR 1 [ky; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2020-24","title":"Colossal OSCAR 1 [ky; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ky</code> Title: Colossal OSCAR 1 [ky; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2020-45","title":"Colossal OSCAR 1 [ky; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ky</code> Title: Colossal OSCAR 1 [ky; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2021-49","title":"Colossal OSCAR 1 [ky; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ky</code> Title: Colossal OSCAR 1 [ky; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2022-27","title":"Colossal OSCAR 1 [ky; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ky</code> Title: Colossal OSCAR 1 [ky; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2022-49","title":"Colossal OSCAR 1 [ky; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ky</code> Title: Colossal OSCAR 1 [ky; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2023-14","title":"Colossal OSCAR 1 [ky; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ky</code> Title: Colossal OSCAR 1 [ky; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#colossal-oscar-1-ky-2023-23","title":"Colossal OSCAR 1 [ky; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ky</code> Title: Colossal OSCAR 1 [ky; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ky/#wura-kirghiz","title":"WURA [Kirghiz]","text":"Dataset ID: <code>wura_mg</code> Title: WURA [Kirghiz] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_la/","title":"Latin Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Latin language.</p>"},{"location":"datasets/language_la/#colossal-oscar-1-la-2015-14","title":"Colossal OSCAR 1 [la; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_la</code> Title: Colossal OSCAR 1 [la; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2016-40","title":"Colossal OSCAR 1 [la; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_la</code> Title: Colossal OSCAR 1 [la; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2017-43","title":"Colossal OSCAR 1 [la; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_la</code> Title: Colossal OSCAR 1 [la; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2018-47","title":"Colossal OSCAR 1 [la; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_la</code> Title: Colossal OSCAR 1 [la; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2019-22","title":"Colossal OSCAR 1 [la; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_la</code> Title: Colossal OSCAR 1 [la; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2020-24","title":"Colossal OSCAR 1 [la; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_la</code> Title: Colossal OSCAR 1 [la; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2020-45","title":"Colossal OSCAR 1 [la; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_la</code> Title: Colossal OSCAR 1 [la; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2021-49","title":"Colossal OSCAR 1 [la; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_la</code> Title: Colossal OSCAR 1 [la; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2022-27","title":"Colossal OSCAR 1 [la; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_la</code> Title: Colossal OSCAR 1 [la; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2022-49","title":"Colossal OSCAR 1 [la; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_la</code> Title: Colossal OSCAR 1 [la; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2023-14","title":"Colossal OSCAR 1 [la; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_la</code> Title: Colossal OSCAR 1 [la; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_la/#colossal-oscar-1-la-2023-23","title":"Colossal OSCAR 1 [la; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_la</code> Title: Colossal OSCAR 1 [la; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_lb/","title":"Luxembourgish Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Luxembourgish language.</p>"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2015-14","title":"Colossal OSCAR 1 [lb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_lb</code> Title: Colossal OSCAR 1 [lb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2016-40","title":"Colossal OSCAR 1 [lb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_lb</code> Title: Colossal OSCAR 1 [lb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2017-43","title":"Colossal OSCAR 1 [lb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_lb</code> Title: Colossal OSCAR 1 [lb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2018-47","title":"Colossal OSCAR 1 [lb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_lb</code> Title: Colossal OSCAR 1 [lb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2019-22","title":"Colossal OSCAR 1 [lb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_lb</code> Title: Colossal OSCAR 1 [lb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2020-24","title":"Colossal OSCAR 1 [lb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_lb</code> Title: Colossal OSCAR 1 [lb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2020-45","title":"Colossal OSCAR 1 [lb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_lb</code> Title: Colossal OSCAR 1 [lb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2021-49","title":"Colossal OSCAR 1 [lb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_lb</code> Title: Colossal OSCAR 1 [lb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2022-27","title":"Colossal OSCAR 1 [lb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_lb</code> Title: Colossal OSCAR 1 [lb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2022-49","title":"Colossal OSCAR 1 [lb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_lb</code> Title: Colossal OSCAR 1 [lb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2023-14","title":"Colossal OSCAR 1 [lb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_lb</code> Title: Colossal OSCAR 1 [lb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lb/#colossal-oscar-1-lb-2023-23","title":"Colossal OSCAR 1 [lb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_lb</code> Title: Colossal OSCAR 1 [lb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_lez/","title":"Lez Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Lez language.</p>"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2015-14","title":"Colossal OSCAR 1 [lez; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_lez</code> Title: Colossal OSCAR 1 [lez; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2016-40","title":"Colossal OSCAR 1 [lez; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_lez</code> Title: Colossal OSCAR 1 [lez; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2017-43","title":"Colossal OSCAR 1 [lez; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_lez</code> Title: Colossal OSCAR 1 [lez; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2018-47","title":"Colossal OSCAR 1 [lez; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_lez</code> Title: Colossal OSCAR 1 [lez; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2019-22","title":"Colossal OSCAR 1 [lez; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_lez</code> Title: Colossal OSCAR 1 [lez; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2020-24","title":"Colossal OSCAR 1 [lez; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_lez</code> Title: Colossal OSCAR 1 [lez; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2020-45","title":"Colossal OSCAR 1 [lez; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_lez</code> Title: Colossal OSCAR 1 [lez; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2021-49","title":"Colossal OSCAR 1 [lez; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_lez</code> Title: Colossal OSCAR 1 [lez; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2022-27","title":"Colossal OSCAR 1 [lez; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_lez</code> Title: Colossal OSCAR 1 [lez; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2022-49","title":"Colossal OSCAR 1 [lez; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_lez</code> Title: Colossal OSCAR 1 [lez; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2023-14","title":"Colossal OSCAR 1 [lez; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_lez</code> Title: Colossal OSCAR 1 [lez; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lez/#colossal-oscar-1-lez-2023-23","title":"Colossal OSCAR 1 [lez; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_lez</code> Title: Colossal OSCAR 1 [lez; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_li/","title":"Limburgish Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Limburgish language.</p>"},{"location":"datasets/language_li/#colossal-oscar-1-li-2015-14","title":"Colossal OSCAR 1 [li; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_li</code> Title: Colossal OSCAR 1 [li; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2016-40","title":"Colossal OSCAR 1 [li; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_li</code> Title: Colossal OSCAR 1 [li; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2017-43","title":"Colossal OSCAR 1 [li; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_li</code> Title: Colossal OSCAR 1 [li; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2018-47","title":"Colossal OSCAR 1 [li; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_li</code> Title: Colossal OSCAR 1 [li; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2019-22","title":"Colossal OSCAR 1 [li; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_li</code> Title: Colossal OSCAR 1 [li; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2020-24","title":"Colossal OSCAR 1 [li; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_li</code> Title: Colossal OSCAR 1 [li; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2020-45","title":"Colossal OSCAR 1 [li; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_li</code> Title: Colossal OSCAR 1 [li; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2021-49","title":"Colossal OSCAR 1 [li; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_li</code> Title: Colossal OSCAR 1 [li; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2022-27","title":"Colossal OSCAR 1 [li; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_li</code> Title: Colossal OSCAR 1 [li; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2022-49","title":"Colossal OSCAR 1 [li; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_li</code> Title: Colossal OSCAR 1 [li; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2023-14","title":"Colossal OSCAR 1 [li; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_li</code> Title: Colossal OSCAR 1 [li; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_li/#colossal-oscar-1-li-2023-23","title":"Colossal OSCAR 1 [li; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_li</code> Title: Colossal OSCAR 1 [li; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_lmo/","title":"Lmo Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Lmo language.</p>"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2015-14","title":"Colossal OSCAR 1 [lmo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_lmo</code> Title: Colossal OSCAR 1 [lmo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2016-40","title":"Colossal OSCAR 1 [lmo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_lmo</code> Title: Colossal OSCAR 1 [lmo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2017-43","title":"Colossal OSCAR 1 [lmo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_lmo</code> Title: Colossal OSCAR 1 [lmo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2018-47","title":"Colossal OSCAR 1 [lmo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_lmo</code> Title: Colossal OSCAR 1 [lmo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2019-22","title":"Colossal OSCAR 1 [lmo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_lmo</code> Title: Colossal OSCAR 1 [lmo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2020-24","title":"Colossal OSCAR 1 [lmo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_lmo</code> Title: Colossal OSCAR 1 [lmo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2020-45","title":"Colossal OSCAR 1 [lmo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_lmo</code> Title: Colossal OSCAR 1 [lmo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2021-49","title":"Colossal OSCAR 1 [lmo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_lmo</code> Title: Colossal OSCAR 1 [lmo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2022-27","title":"Colossal OSCAR 1 [lmo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_lmo</code> Title: Colossal OSCAR 1 [lmo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2022-49","title":"Colossal OSCAR 1 [lmo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_lmo</code> Title: Colossal OSCAR 1 [lmo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2023-14","title":"Colossal OSCAR 1 [lmo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_lmo</code> Title: Colossal OSCAR 1 [lmo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lmo/#colossal-oscar-1-lmo-2023-23","title":"Colossal OSCAR 1 [lmo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_lmo</code> Title: Colossal OSCAR 1 [lmo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_lo/","title":"Lao Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Lao language.</p>"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2015-14","title":"Colossal OSCAR 1 [lo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_lo</code> Title: Colossal OSCAR 1 [lo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2016-40","title":"Colossal OSCAR 1 [lo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_lo</code> Title: Colossal OSCAR 1 [lo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2017-43","title":"Colossal OSCAR 1 [lo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_lo</code> Title: Colossal OSCAR 1 [lo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2018-47","title":"Colossal OSCAR 1 [lo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_lo</code> Title: Colossal OSCAR 1 [lo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2019-22","title":"Colossal OSCAR 1 [lo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_lo</code> Title: Colossal OSCAR 1 [lo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2020-24","title":"Colossal OSCAR 1 [lo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_lo</code> Title: Colossal OSCAR 1 [lo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2020-45","title":"Colossal OSCAR 1 [lo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_lo</code> Title: Colossal OSCAR 1 [lo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2021-49","title":"Colossal OSCAR 1 [lo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_lo</code> Title: Colossal OSCAR 1 [lo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2022-27","title":"Colossal OSCAR 1 [lo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_lo</code> Title: Colossal OSCAR 1 [lo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2022-49","title":"Colossal OSCAR 1 [lo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_lo</code> Title: Colossal OSCAR 1 [lo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2023-14","title":"Colossal OSCAR 1 [lo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_lo</code> Title: Colossal OSCAR 1 [lo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lo/#colossal-oscar-1-lo-2023-23","title":"Colossal OSCAR 1 [lo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_lo</code> Title: Colossal OSCAR 1 [lo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_lt/","title":"Lithuanian Datasets","text":"<p>There are in total 19 datasets with 5 B tokens in Lithuanian language.</p>"},{"location":"datasets/language_lt/#bilingual-english-lithuanian-parallel-corpus-from-seimas-of-the-republic-of-lithuania-website","title":"Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website","text":"Dataset ID: <code>seimas_lt_en</code> Title: Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website Description: Contents of http://www.lrs.lt were crawled, aligned on document and sentence level and converted into a parallel corpus. Availibility: <code>None</code> Homepage: [https://live.european-language-grid.eu/catalogue/corpus/3009/download/] License: Open under PSI (commercial use: None, sharealike: None) Tokens: 48 k"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2015-14","title":"Colossal OSCAR 1 [lt; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_lt</code> Title: Colossal OSCAR 1 [lt; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2016-40","title":"Colossal OSCAR 1 [lt; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_lt</code> Title: Colossal OSCAR 1 [lt; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2017-43","title":"Colossal OSCAR 1 [lt; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_lt</code> Title: Colossal OSCAR 1 [lt; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2018-47","title":"Colossal OSCAR 1 [lt; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_lt</code> Title: Colossal OSCAR 1 [lt; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2019-22","title":"Colossal OSCAR 1 [lt; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_lt</code> Title: Colossal OSCAR 1 [lt; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2020-24","title":"Colossal OSCAR 1 [lt; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_lt</code> Title: Colossal OSCAR 1 [lt; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2020-45","title":"Colossal OSCAR 1 [lt; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_lt</code> Title: Colossal OSCAR 1 [lt; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2021-49","title":"Colossal OSCAR 1 [lt; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_lt</code> Title: Colossal OSCAR 1 [lt; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2022-27","title":"Colossal OSCAR 1 [lt; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_lt</code> Title: Colossal OSCAR 1 [lt; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2022-49","title":"Colossal OSCAR 1 [lt; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_lt</code> Title: Colossal OSCAR 1 [lt; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2023-14","title":"Colossal OSCAR 1 [lt; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_lt</code> Title: Colossal OSCAR 1 [lt; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lt/#colossal-oscar-1-lt-2023-23","title":"Colossal OSCAR 1 [lt; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_lt</code> Title: Colossal OSCAR 1 [lt; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_lt/#eurlexresources-lt","title":"EurlexResources [lt]","text":"Dataset ID: <code>eurlex_lt</code> Title: EurlexResources [lt] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_lt/#legalmc4-lt","title":"LegalMC4 [lt]","text":"Dataset ID: <code>legal_mc4_lt</code> Title: LegalMC4 [lt] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. 
By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 9 M"},{"location":"datasets/language_lt/#wikibooks-lt","title":"Wikibooks [lt]","text":"Dataset ID: <code>wikibooks_lt</code> Title: Wikibooks [lt] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 594 k"},{"location":"datasets/language_lt/#wikipedia-lt","title":"Wikipedia [lt]","text":"Dataset ID: <code>wiki_lt</code> Title: Wikipedia [lt] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 33 M"},{"location":"datasets/language_lt/#wikiquote-lt","title":"Wikiquote [lt]","text":"Dataset ID: <code>wikiquote_lt</code> Title: Wikiquote [lt] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_lt/#wikisource-lt","title":"Wikisource [lt]","text":"Dataset ID: <code>wikisource_lt</code> Title: Wikisource [lt] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_lv/","title":"Latvian Datasets","text":"<p>There are in total 17 datasets with 4 B tokens in Latvian language.</p>"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2015-14","title":"Colossal OSCAR 1 [lv; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_lv</code> Title: Colossal OSCAR 1 [lv; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2016-40","title":"Colossal OSCAR 1 [lv; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_lv</code> Title: Colossal OSCAR 1 [lv; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2017-43","title":"Colossal OSCAR 1 [lv; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_lv</code> Title: Colossal OSCAR 1 [lv; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2018-47","title":"Colossal OSCAR 1 [lv; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_lv</code> Title: Colossal OSCAR 1 [lv; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2019-22","title":"Colossal OSCAR 1 [lv; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_lv</code> Title: Colossal OSCAR 1 [lv; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2020-24","title":"Colossal OSCAR 1 [lv; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_lv</code> Title: Colossal OSCAR 1 [lv; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2020-45","title":"Colossal OSCAR 1 [lv; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_lv</code> Title: Colossal OSCAR 1 [lv; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2021-49","title":"Colossal OSCAR 1 [lv; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_lv</code> Title: Colossal OSCAR 1 [lv; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2022-27","title":"Colossal OSCAR 1 [lv; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_lv</code> Title: Colossal OSCAR 1 [lv; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2022-49","title":"Colossal OSCAR 1 [lv; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_lv</code> Title: Colossal OSCAR 1 [lv; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2023-14","title":"Colossal OSCAR 1 [lv; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_lv</code> Title: Colossal OSCAR 1 [lv; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_lv/#colossal-oscar-1-lv-2023-23","title":"Colossal OSCAR 1 [lv; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_lv</code> Title: Colossal OSCAR 1 [lv; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 792 M"},{"location":"datasets/language_lv/#corpus-of-state-related-content-from-the-latvian-web-processed","title":"Corpus of State-related content from the Latvian Web (Processed)","text":"Dataset ID: <code>state_related_latvian_web</code> Title: Corpus of State-related content from the Latvian Web (Processed) Description: Latvian Web, home pages of ministries and state public services, army, etc. were crawled, and parallel Latvian-English content was collected. Availibility: <code>signin_download</code> Homepage: [http://catalog.elra.info/en-us/repository/browse/ELRA-W0169/] License: CC-BY-SA-4.0 (commercial use: True, sharealike: True) Tokens: 1 M"},{"location":"datasets/language_lv/#eurlexresources-lv","title":"EurlexResources [lv]","text":"Dataset ID: <code>eurlex_lv</code> Title: EurlexResources [lv] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_lv/#legalmc4-lv","title":"LegalMC4 [lv]","text":"Dataset ID: <code>legal_mc4_lv</code> Title: LegalMC4 [lv] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. 
(commercial use: True, sharealike: None) Tokens: 59 k"},{"location":"datasets/language_lv/#wikibooks-lv","title":"Wikibooks [lv]","text":"Dataset ID: <code>wikibooks_lv</code> Title: Wikibooks [lv] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 33 k"},{"location":"datasets/language_lv/#wikipedia-lv","title":"Wikipedia [lv]","text":"Dataset ID: <code>wiki_lv</code> Title: Wikipedia [lv] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 29 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_mai/","title":"Mai Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Mai language.</p>"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2015-14","title":"Colossal OSCAR 1 [mai; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mai</code> Title: Colossal OSCAR 1 [mai; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2016-40","title":"Colossal OSCAR 1 [mai; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mai</code> Title: Colossal OSCAR 1 [mai; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2017-43","title":"Colossal OSCAR 1 [mai; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mai</code> Title: Colossal OSCAR 1 [mai; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2018-47","title":"Colossal OSCAR 1 [mai; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mai</code> Title: Colossal OSCAR 1 [mai; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2019-22","title":"Colossal OSCAR 1 [mai; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mai</code> Title: Colossal OSCAR 1 [mai; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2020-24","title":"Colossal OSCAR 1 [mai; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mai</code> Title: Colossal OSCAR 1 [mai; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2020-45","title":"Colossal OSCAR 1 [mai; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mai</code> Title: Colossal OSCAR 1 [mai; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2021-49","title":"Colossal OSCAR 1 [mai; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mai</code> Title: Colossal OSCAR 1 [mai; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2022-27","title":"Colossal OSCAR 1 [mai; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mai</code> Title: Colossal OSCAR 1 [mai; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2022-49","title":"Colossal OSCAR 1 [mai; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mai</code> Title: Colossal OSCAR 1 [mai; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2023-14","title":"Colossal OSCAR 1 [mai; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mai</code> Title: Colossal OSCAR 1 [mai; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mai/#colossal-oscar-1-mai-2023-23","title":"Colossal OSCAR 1 [mai; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mai</code> Title: Colossal OSCAR 1 [mai; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mg/","title":"Malagasy Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Malagasy language.</p>"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2015-14","title":"Colossal OSCAR 1 [mg; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mg</code> Title: Colossal OSCAR 1 [mg; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2016-40","title":"Colossal OSCAR 1 [mg; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mg</code> Title: Colossal OSCAR 1 [mg; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2017-43","title":"Colossal OSCAR 1 [mg; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mg</code> Title: Colossal OSCAR 1 [mg; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2018-47","title":"Colossal OSCAR 1 [mg; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mg</code> Title: Colossal OSCAR 1 [mg; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2019-22","title":"Colossal OSCAR 1 [mg; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mg</code> Title: Colossal OSCAR 1 [mg; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2020-24","title":"Colossal OSCAR 1 [mg; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mg</code> Title: Colossal OSCAR 1 [mg; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2020-45","title":"Colossal OSCAR 1 [mg; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mg</code> Title: Colossal OSCAR 1 [mg; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2021-49","title":"Colossal OSCAR 1 [mg; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mg</code> Title: Colossal OSCAR 1 [mg; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2022-27","title":"Colossal OSCAR 1 [mg; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mg</code> Title: Colossal OSCAR 1 [mg; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2022-49","title":"Colossal OSCAR 1 [mg; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mg</code> Title: Colossal OSCAR 1 [mg; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2023-14","title":"Colossal OSCAR 1 [mg; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mg</code> Title: Colossal OSCAR 1 [mg; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mg/#colossal-oscar-1-mg-2023-23","title":"Colossal OSCAR 1 [mg; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mg</code> Title: Colossal OSCAR 1 [mg; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mhr/","title":"Mhr Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Mhr language.</p>"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2015-14","title":"Colossal OSCAR 1 [mhr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mhr</code> Title: Colossal OSCAR 1 [mhr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2016-40","title":"Colossal OSCAR 1 [mhr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mhr</code> Title: Colossal OSCAR 1 [mhr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2017-43","title":"Colossal OSCAR 1 [mhr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mhr</code> Title: Colossal OSCAR 1 [mhr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2018-47","title":"Colossal OSCAR 1 [mhr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mhr</code> Title: Colossal OSCAR 1 [mhr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2019-22","title":"Colossal OSCAR 1 [mhr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mhr</code> Title: Colossal OSCAR 1 [mhr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2020-24","title":"Colossal OSCAR 1 [mhr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mhr</code> Title: Colossal OSCAR 1 [mhr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2020-45","title":"Colossal OSCAR 1 [mhr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mhr</code> Title: Colossal OSCAR 1 [mhr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2021-49","title":"Colossal OSCAR 1 [mhr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mhr</code> Title: Colossal OSCAR 1 [mhr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2022-27","title":"Colossal OSCAR 1 [mhr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mhr</code> Title: Colossal OSCAR 1 [mhr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2022-49","title":"Colossal OSCAR 1 [mhr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mhr</code> Title: Colossal OSCAR 1 [mhr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2023-14","title":"Colossal OSCAR 1 [mhr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mhr</code> Title: Colossal OSCAR 1 [mhr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mhr/#colossal-oscar-1-mhr-2023-23","title":"Colossal OSCAR 1 [mhr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mhr</code> Title: Colossal OSCAR 1 [mhr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_min/","title":"Min Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Min language.</p>"},{"location":"datasets/language_min/#colossal-oscar-1-min-2015-14","title":"Colossal OSCAR 1 [min; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_min</code> Title: Colossal OSCAR 1 [min; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2016-40","title":"Colossal OSCAR 1 [min; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_min</code> Title: Colossal OSCAR 1 [min; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2017-43","title":"Colossal OSCAR 1 [min; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_min</code> Title: Colossal OSCAR 1 [min; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2018-47","title":"Colossal OSCAR 1 [min; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_min</code> Title: Colossal OSCAR 1 [min; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2019-22","title":"Colossal OSCAR 1 [min; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_min</code> Title: Colossal OSCAR 1 [min; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2020-24","title":"Colossal OSCAR 1 [min; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_min</code> Title: Colossal OSCAR 1 [min; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2020-45","title":"Colossal OSCAR 1 [min; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_min</code> Title: Colossal OSCAR 1 [min; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2021-49","title":"Colossal OSCAR 1 [min; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_min</code> Title: Colossal OSCAR 1 [min; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2022-27","title":"Colossal OSCAR 1 [min; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_min</code> Title: Colossal OSCAR 1 [min; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2022-49","title":"Colossal OSCAR 1 [min; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_min</code> Title: Colossal OSCAR 1 [min; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2023-14","title":"Colossal OSCAR 1 [min; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_min</code> Title: Colossal OSCAR 1 [min; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_min/#colossal-oscar-1-min-2023-23","title":"Colossal OSCAR 1 [min; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_min</code> Title: Colossal OSCAR 1 [min; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mk/","title":"Macedonian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Macedonian language.</p>"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2015-14","title":"Colossal OSCAR 1 [mk; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mk</code> Title: Colossal OSCAR 1 [mk; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2016-40","title":"Colossal OSCAR 1 [mk; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mk</code> Title: Colossal OSCAR 1 [mk; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2017-43","title":"Colossal OSCAR 1 [mk; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mk</code> Title: Colossal OSCAR 1 [mk; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2018-47","title":"Colossal OSCAR 1 [mk; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mk</code> Title: Colossal OSCAR 1 [mk; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2019-22","title":"Colossal OSCAR 1 [mk; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mk</code> Title: Colossal OSCAR 1 [mk; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2020-24","title":"Colossal OSCAR 1 [mk; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mk</code> Title: Colossal OSCAR 1 [mk; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2020-45","title":"Colossal OSCAR 1 [mk; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mk</code> Title: Colossal OSCAR 1 [mk; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2021-49","title":"Colossal OSCAR 1 [mk; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mk</code> Title: Colossal OSCAR 1 [mk; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2022-27","title":"Colossal OSCAR 1 [mk; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mk</code> Title: Colossal OSCAR 1 [mk; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2022-49","title":"Colossal OSCAR 1 [mk; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mk</code> Title: Colossal OSCAR 1 [mk; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2023-14","title":"Colossal OSCAR 1 [mk; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mk</code> Title: Colossal OSCAR 1 [mk; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mk/#colossal-oscar-1-mk-2023-23","title":"Colossal OSCAR 1 [mk; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mk</code> Title: Colossal OSCAR 1 [mk; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ml/","title":"Malayalam Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Malayalam language.</p>"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2015-14","title":"Colossal OSCAR 1 [ml; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ml</code> Title: Colossal OSCAR 1 [ml; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2016-40","title":"Colossal OSCAR 1 [ml; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ml</code> Title: Colossal OSCAR 1 [ml; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2017-43","title":"Colossal OSCAR 1 [ml; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ml</code> Title: Colossal OSCAR 1 [ml; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2018-47","title":"Colossal OSCAR 1 [ml; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ml</code> Title: Colossal OSCAR 1 [ml; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2019-22","title":"Colossal OSCAR 1 [ml; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ml</code> Title: Colossal OSCAR 1 [ml; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2020-24","title":"Colossal OSCAR 1 [ml; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ml</code> Title: Colossal OSCAR 1 [ml; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2020-45","title":"Colossal OSCAR 1 [ml; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ml</code> Title: Colossal OSCAR 1 [ml; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2021-49","title":"Colossal OSCAR 1 [ml; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ml</code> Title: Colossal OSCAR 1 [ml; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2022-27","title":"Colossal OSCAR 1 [ml; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ml</code> Title: Colossal OSCAR 1 [ml; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2022-49","title":"Colossal OSCAR 1 [ml; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ml</code> Title: Colossal OSCAR 1 [ml; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2023-14","title":"Colossal OSCAR 1 [ml; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ml</code> Title: Colossal OSCAR 1 [ml; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ml/#colossal-oscar-1-ml-2023-23","title":"Colossal OSCAR 1 [ml; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ml</code> Title: Colossal OSCAR 1 [ml; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mn/","title":"Mongolian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Mongolian language.</p>"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2015-14","title":"Colossal OSCAR 1 [mn; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mn</code> Title: Colossal OSCAR 1 [mn; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2016-40","title":"Colossal OSCAR 1 [mn; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mn</code> Title: Colossal OSCAR 1 [mn; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2017-43","title":"Colossal OSCAR 1 [mn; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mn</code> Title: Colossal OSCAR 1 [mn; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2018-47","title":"Colossal OSCAR 1 [mn; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mn</code> Title: Colossal OSCAR 1 [mn; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2019-22","title":"Colossal OSCAR 1 [mn; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mn</code> Title: Colossal OSCAR 1 [mn; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2020-24","title":"Colossal OSCAR 1 [mn; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mn</code> Title: Colossal OSCAR 1 [mn; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2020-45","title":"Colossal OSCAR 1 [mn; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mn</code> Title: Colossal OSCAR 1 [mn; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2021-49","title":"Colossal OSCAR 1 [mn; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mn</code> Title: Colossal OSCAR 1 [mn; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2022-27","title":"Colossal OSCAR 1 [mn; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mn</code> Title: Colossal OSCAR 1 [mn; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2022-49","title":"Colossal OSCAR 1 [mn; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mn</code> Title: Colossal OSCAR 1 [mn; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2023-14","title":"Colossal OSCAR 1 [mn; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mn</code> Title: Colossal OSCAR 1 [mn; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mn/#colossal-oscar-1-mn-2023-23","title":"Colossal OSCAR 1 [mn; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mn</code> Title: Colossal OSCAR 1 [mn; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mr/","title":"Marathi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Marathi language.</p>"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2015-14","title":"Colossal OSCAR 1 [mr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mr</code> Title: Colossal OSCAR 1 [mr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2016-40","title":"Colossal OSCAR 1 [mr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mr</code> Title: Colossal OSCAR 1 [mr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2017-43","title":"Colossal OSCAR 1 [mr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mr</code> Title: Colossal OSCAR 1 [mr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2018-47","title":"Colossal OSCAR 1 [mr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mr</code> Title: Colossal OSCAR 1 [mr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2019-22","title":"Colossal OSCAR 1 [mr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mr</code> Title: Colossal OSCAR 1 [mr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2020-24","title":"Colossal OSCAR 1 [mr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mr</code> Title: Colossal OSCAR 1 [mr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2020-45","title":"Colossal OSCAR 1 [mr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mr</code> Title: Colossal OSCAR 1 [mr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2021-49","title":"Colossal OSCAR 1 [mr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mr</code> Title: Colossal OSCAR 1 [mr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2022-27","title":"Colossal OSCAR 1 [mr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mr</code> Title: Colossal OSCAR 1 [mr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2022-49","title":"Colossal OSCAR 1 [mr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mr</code> Title: Colossal OSCAR 1 [mr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2023-14","title":"Colossal OSCAR 1 [mr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mr</code> Title: Colossal OSCAR 1 [mr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mr/#colossal-oscar-1-mr-2023-23","title":"Colossal OSCAR 1 [mr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mr</code> Title: Colossal OSCAR 1 [mr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mrj/","title":"Mrj Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Mrj language.</p>"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2015-14","title":"Colossal OSCAR 1 [mrj; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mrj</code> Title: Colossal OSCAR 1 [mrj; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2016-40","title":"Colossal OSCAR 1 [mrj; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mrj</code> Title: Colossal OSCAR 1 [mrj; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2017-43","title":"Colossal OSCAR 1 [mrj; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mrj</code> Title: Colossal OSCAR 1 [mrj; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2018-47","title":"Colossal OSCAR 1 [mrj; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mrj</code> Title: Colossal OSCAR 1 [mrj; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2019-22","title":"Colossal OSCAR 1 [mrj; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mrj</code> Title: Colossal OSCAR 1 [mrj; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2020-24","title":"Colossal OSCAR 1 [mrj; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mrj</code> Title: Colossal OSCAR 1 [mrj; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2020-45","title":"Colossal OSCAR 1 [mrj; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mrj</code> Title: Colossal OSCAR 1 [mrj; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2021-49","title":"Colossal OSCAR 1 [mrj; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mrj</code> Title: Colossal OSCAR 1 [mrj; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2022-27","title":"Colossal OSCAR 1 [mrj; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mrj</code> Title: Colossal OSCAR 1 [mrj; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2022-49","title":"Colossal OSCAR 1 [mrj; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mrj</code> Title: Colossal OSCAR 1 [mrj; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2023-14","title":"Colossal OSCAR 1 [mrj; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mrj</code> Title: Colossal OSCAR 1 [mrj; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mrj/#colossal-oscar-1-mrj-2023-23","title":"Colossal OSCAR 1 [mrj; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mrj</code> Title: Colossal OSCAR 1 [mrj; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ms/","title":"Malay Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Malay language.</p>"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2015-14","title":"Colossal OSCAR 1 [ms; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ms</code> Title: Colossal OSCAR 1 [ms; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2016-40","title":"Colossal OSCAR 1 [ms; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ms</code> Title: Colossal OSCAR 1 [ms; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2017-43","title":"Colossal OSCAR 1 [ms; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ms</code> Title: Colossal OSCAR 1 [ms; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2018-47","title":"Colossal OSCAR 1 [ms; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ms</code> Title: Colossal OSCAR 1 [ms; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2019-22","title":"Colossal OSCAR 1 [ms; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ms</code> Title: Colossal OSCAR 1 [ms; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2020-24","title":"Colossal OSCAR 1 [ms; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ms</code> Title: Colossal OSCAR 1 [ms; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2020-45","title":"Colossal OSCAR 1 [ms; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ms</code> Title: Colossal OSCAR 1 [ms; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2021-49","title":"Colossal OSCAR 1 [ms; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ms</code> Title: Colossal OSCAR 1 [ms; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2022-27","title":"Colossal OSCAR 1 [ms; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ms</code> Title: Colossal OSCAR 1 [ms; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2022-49","title":"Colossal OSCAR 1 [ms; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ms</code> Title: Colossal OSCAR 1 [ms; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2023-14","title":"Colossal OSCAR 1 [ms; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ms</code> Title: Colossal OSCAR 1 [ms; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ms/#colossal-oscar-1-ms-2023-23","title":"Colossal OSCAR 1 [ms; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ms</code> Title: Colossal OSCAR 1 [ms; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mt/","title":"Maltese Datasets","text":"<p>There are in total 17 datasets with 4 B tokens in Maltese language.</p>"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2015-14","title":"Colossal OSCAR 1 [mt; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mt</code> Title: Colossal OSCAR 1 [mt; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2016-40","title":"Colossal OSCAR 1 [mt; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mt</code> Title: Colossal OSCAR 1 [mt; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2017-43","title":"Colossal OSCAR 1 [mt; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mt</code> Title: Colossal OSCAR 1 [mt; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2018-47","title":"Colossal OSCAR 1 [mt; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mt</code> Title: Colossal OSCAR 1 [mt; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2019-22","title":"Colossal OSCAR 1 [mt; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mt</code> Title: Colossal OSCAR 1 [mt; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2020-24","title":"Colossal OSCAR 1 [mt; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mt</code> Title: Colossal OSCAR 1 [mt; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2020-45","title":"Colossal OSCAR 1 [mt; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mt</code> Title: Colossal OSCAR 1 [mt; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2021-49","title":"Colossal OSCAR 1 [mt; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mt</code> Title: Colossal OSCAR 1 [mt; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2022-27","title":"Colossal OSCAR 1 [mt; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mt</code> Title: Colossal OSCAR 1 [mt; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2022-49","title":"Colossal OSCAR 1 [mt; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mt</code> Title: Colossal OSCAR 1 [mt; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2023-14","title":"Colossal OSCAR 1 [mt; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mt</code> Title: Colossal OSCAR 1 [mt; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mt/#colossal-oscar-1-mt-2023-23","title":"Colossal OSCAR 1 [mt; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mt</code> Title: Colossal OSCAR 1 [mt; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 193 k"},{"location":"datasets/language_mt/#eurlexresources-mt","title":"EurlexResources [mt]","text":"Dataset ID: <code>eurlex_mt</code> Title: EurlexResources [mt] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_mt/#korpus-malti","title":"Korpus Malti","text":"Dataset ID: <code>korpus_malti</code> Title: Korpus Malti Description: General Corpora for the Maltese Language. This dataset is composed of texts from various genres/domains written in Maltese. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/MLRS/korpus_malti] License: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (DFKI has a permission for LLM training with commercial license) (commercial use: False, sharealike: True) Tokens: 366 M"},{"location":"datasets/language_mt/#legalmc4-mt","title":"LegalMC4 [mt]","text":"Dataset ID: <code>legal_mc4_mt</code> Title: LegalMC4 [mt] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 3 M"},{"location":"datasets/language_mt/#macocu-web-corpus-maltese-20","title":"MaCoCu web corpus [Maltese 2.0]","text":"Dataset ID: <code>macocu_mt</code> Title: MaCoCu web corpus [Maltese 2.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1803] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 348 M"},{"location":"datasets/language_mt/#wikipedia-mt","title":"Wikipedia [mt]","text":"Dataset ID: <code>wiki_mt</code> Title: Wikipedia [mt] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 6 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_multi/","title":"Multi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Multi language.</p>"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2015-14","title":"Colossal OSCAR 1 [multi; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_multi</code> Title: Colossal OSCAR 1 [multi; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2016-40","title":"Colossal OSCAR 1 [multi; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_multi</code> Title: Colossal OSCAR 1 [multi; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2017-43","title":"Colossal OSCAR 1 [multi; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_multi</code> Title: Colossal OSCAR 1 [multi; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2018-47","title":"Colossal OSCAR 1 [multi; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_multi</code> Title: Colossal OSCAR 1 [multi; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2019-22","title":"Colossal OSCAR 1 [multi; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_multi</code> Title: Colossal OSCAR 1 [multi; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2020-24","title":"Colossal OSCAR 1 [multi; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_multi</code> Title: Colossal OSCAR 1 [multi; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2020-45","title":"Colossal OSCAR 1 [multi; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_multi</code> Title: Colossal OSCAR 1 [multi; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2021-49","title":"Colossal OSCAR 1 [multi; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_multi</code> Title: Colossal OSCAR 1 [multi; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2022-27","title":"Colossal OSCAR 1 [multi; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_multi</code> Title: Colossal OSCAR 1 [multi; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2022-49","title":"Colossal OSCAR 1 [multi; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_multi</code> Title: Colossal OSCAR 1 [multi; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2023-14","title":"Colossal OSCAR 1 [multi; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_multi</code> Title: Colossal OSCAR 1 [multi; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_multi/#colossal-oscar-1-multi-2023-23","title":"Colossal OSCAR 1 [multi; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_multi</code> Title: Colossal OSCAR 1 [multi; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mwl/","title":"Mwl Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Mwl language.</p>"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2015-14","title":"Colossal OSCAR 1 [mwl; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mwl</code> Title: Colossal OSCAR 1 [mwl; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2016-40","title":"Colossal OSCAR 1 [mwl; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mwl</code> Title: Colossal OSCAR 1 [mwl; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2017-43","title":"Colossal OSCAR 1 [mwl; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mwl</code> Title: Colossal OSCAR 1 [mwl; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2018-47","title":"Colossal OSCAR 1 [mwl; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mwl</code> Title: Colossal OSCAR 1 [mwl; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2019-22","title":"Colossal OSCAR 1 [mwl; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mwl</code> Title: Colossal OSCAR 1 [mwl; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2020-24","title":"Colossal OSCAR 1 [mwl; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mwl</code> Title: Colossal OSCAR 1 [mwl; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2020-45","title":"Colossal OSCAR 1 [mwl; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mwl</code> Title: Colossal OSCAR 1 [mwl; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2021-49","title":"Colossal OSCAR 1 [mwl; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mwl</code> Title: Colossal OSCAR 1 [mwl; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2022-27","title":"Colossal OSCAR 1 [mwl; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mwl</code> Title: Colossal OSCAR 1 [mwl; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2022-49","title":"Colossal OSCAR 1 [mwl; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mwl</code> Title: Colossal OSCAR 1 [mwl; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2023-14","title":"Colossal OSCAR 1 [mwl; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mwl</code> Title: Colossal OSCAR 1 [mwl; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mwl/#colossal-oscar-1-mwl-2023-23","title":"Colossal OSCAR 1 [mwl; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mwl</code> Title: Colossal OSCAR 1 [mwl; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_my/","title":"Burmese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Burmese language.</p>"},{"location":"datasets/language_my/#colossal-oscar-1-my-2015-14","title":"Colossal OSCAR 1 [my; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_my</code> Title: Colossal OSCAR 1 [my; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2016-40","title":"Colossal OSCAR 1 [my; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_my</code> Title: Colossal OSCAR 1 [my; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2017-43","title":"Colossal OSCAR 1 [my; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_my</code> Title: Colossal OSCAR 1 [my; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2018-47","title":"Colossal OSCAR 1 [my; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_my</code> Title: Colossal OSCAR 1 [my; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2019-22","title":"Colossal OSCAR 1 [my; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_my</code> Title: Colossal OSCAR 1 [my; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2020-24","title":"Colossal OSCAR 1 [my; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_my</code> Title: Colossal OSCAR 1 [my; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2020-45","title":"Colossal OSCAR 1 [my; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_my</code> Title: Colossal OSCAR 1 [my; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2021-49","title":"Colossal OSCAR 1 [my; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_my</code> Title: Colossal OSCAR 1 [my; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2022-27","title":"Colossal OSCAR 1 [my; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_my</code> Title: Colossal OSCAR 1 [my; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2022-49","title":"Colossal OSCAR 1 [my; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_my</code> Title: Colossal OSCAR 1 [my; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2023-14","title":"Colossal OSCAR 1 [my; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_my</code> Title: Colossal OSCAR 1 [my; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_my/#colossal-oscar-1-my-2023-23","title":"Colossal OSCAR 1 [my; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_my</code> Title: Colossal OSCAR 1 [my; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_mzn/","title":"Mzn Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Mzn language.</p>"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2015-14","title":"Colossal OSCAR 1 [mzn; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_mzn</code> Title: Colossal OSCAR 1 [mzn; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2016-40","title":"Colossal OSCAR 1 [mzn; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_mzn</code> Title: Colossal OSCAR 1 [mzn; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2017-43","title":"Colossal OSCAR 1 [mzn; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_mzn</code> Title: Colossal OSCAR 1 [mzn; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2018-47","title":"Colossal OSCAR 1 [mzn; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_mzn</code> Title: Colossal OSCAR 1 [mzn; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2019-22","title":"Colossal OSCAR 1 [mzn; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_mzn</code> Title: Colossal OSCAR 1 [mzn; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2020-24","title":"Colossal OSCAR 1 [mzn; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_mzn</code> Title: Colossal OSCAR 1 [mzn; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2020-45","title":"Colossal OSCAR 1 [mzn; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_mzn</code> Title: Colossal OSCAR 1 [mzn; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2021-49","title":"Colossal OSCAR 1 [mzn; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_mzn</code> Title: Colossal OSCAR 1 [mzn; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2022-27","title":"Colossal OSCAR 1 [mzn; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_mzn</code> Title: Colossal OSCAR 1 [mzn; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2022-49","title":"Colossal OSCAR 1 [mzn; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_mzn</code> Title: Colossal OSCAR 1 [mzn; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2023-14","title":"Colossal OSCAR 1 [mzn; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_mzn</code> Title: Colossal OSCAR 1 [mzn; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_mzn/#colossal-oscar-1-mzn-2023-23","title":"Colossal OSCAR 1 [mzn; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_mzn</code> Title: Colossal OSCAR 1 [mzn; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_nah/","title":"Nah Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Nah language.</p>"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2015-14","title":"Colossal OSCAR 1 [nah; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_nah</code> Title: Colossal OSCAR 1 [nah; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2016-40","title":"Colossal OSCAR 1 [nah; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_nah</code> Title: Colossal OSCAR 1 [nah; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2017-43","title":"Colossal OSCAR 1 [nah; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_nah</code> Title: Colossal OSCAR 1 [nah; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2018-47","title":"Colossal OSCAR 1 [nah; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_nah</code> Title: Colossal OSCAR 1 [nah; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2019-22","title":"Colossal OSCAR 1 [nah; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_nah</code> Title: Colossal OSCAR 1 [nah; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2020-24","title":"Colossal OSCAR 1 [nah; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_nah</code> Title: Colossal OSCAR 1 [nah; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2020-45","title":"Colossal OSCAR 1 [nah; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_nah</code> Title: Colossal OSCAR 1 [nah; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2021-49","title":"Colossal OSCAR 1 [nah; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_nah</code> Title: Colossal OSCAR 1 [nah; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2022-27","title":"Colossal OSCAR 1 [nah; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_nah</code> Title: Colossal OSCAR 1 [nah; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2022-49","title":"Colossal OSCAR 1 [nah; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_nah</code> Title: Colossal OSCAR 1 [nah; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2023-14","title":"Colossal OSCAR 1 [nah; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_nah</code> Title: Colossal OSCAR 1 [nah; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nah/#colossal-oscar-1-nah-2023-23","title":"Colossal OSCAR 1 [nah; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_nah</code> Title: Colossal OSCAR 1 [nah; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_nds/","title":"Nds Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Nds language.</p>"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2015-14","title":"Colossal OSCAR 1 [nds; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_nds</code> Title: Colossal OSCAR 1 [nds; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2016-40","title":"Colossal OSCAR 1 [nds; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_nds</code> Title: Colossal OSCAR 1 [nds; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2017-43","title":"Colossal OSCAR 1 [nds; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_nds</code> Title: Colossal OSCAR 1 [nds; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2018-47","title":"Colossal OSCAR 1 [nds; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_nds</code> Title: Colossal OSCAR 1 [nds; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2019-22","title":"Colossal OSCAR 1 [nds; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_nds</code> Title: Colossal OSCAR 1 [nds; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2020-24","title":"Colossal OSCAR 1 [nds; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_nds</code> Title: Colossal OSCAR 1 [nds; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2020-45","title":"Colossal OSCAR 1 [nds; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_nds</code> Title: Colossal OSCAR 1 [nds; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2021-49","title":"Colossal OSCAR 1 [nds; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_nds</code> Title: Colossal OSCAR 1 [nds; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2022-27","title":"Colossal OSCAR 1 [nds; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_nds</code> Title: Colossal OSCAR 1 [nds; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2022-49","title":"Colossal OSCAR 1 [nds; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_nds</code> Title: Colossal OSCAR 1 [nds; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2023-14","title":"Colossal OSCAR 1 [nds; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_nds</code> Title: Colossal OSCAR 1 [nds; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nds/#colossal-oscar-1-nds-2023-23","title":"Colossal OSCAR 1 [nds; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_nds</code> Title: Colossal OSCAR 1 [nds; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ne/","title":"Nepali Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Nepali language.</p>"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2015-14","title":"Colossal OSCAR 1 [ne; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ne</code> Title: Colossal OSCAR 1 [ne; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2016-40","title":"Colossal OSCAR 1 [ne; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ne</code> Title: Colossal OSCAR 1 [ne; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2017-43","title":"Colossal OSCAR 1 [ne; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ne</code> Title: Colossal OSCAR 1 [ne; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2018-47","title":"Colossal OSCAR 1 [ne; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ne</code> Title: Colossal OSCAR 1 [ne; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2019-22","title":"Colossal OSCAR 1 [ne; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ne</code> Title: Colossal OSCAR 1 [ne; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2020-24","title":"Colossal OSCAR 1 [ne; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ne</code> Title: Colossal OSCAR 1 [ne; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2020-45","title":"Colossal OSCAR 1 [ne; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ne</code> Title: Colossal OSCAR 1 [ne; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2021-49","title":"Colossal OSCAR 1 [ne; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ne</code> Title: Colossal OSCAR 1 [ne; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2022-27","title":"Colossal OSCAR 1 [ne; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ne</code> Title: Colossal OSCAR 1 [ne; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2022-49","title":"Colossal OSCAR 1 [ne; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ne</code> Title: Colossal OSCAR 1 [ne; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2023-14","title":"Colossal OSCAR 1 [ne; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ne</code> Title: Colossal OSCAR 1 [ne; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ne/#colossal-oscar-1-ne-2023-23","title":"Colossal OSCAR 1 [ne; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ne</code> Title: Colossal OSCAR 1 [ne; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_new/","title":"New Datasets","text":"<p>There are in total 12 datasets with N/A tokens in New language.</p>"},{"location":"datasets/language_new/#colossal-oscar-1-new-2015-14","title":"Colossal OSCAR 1 [new; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_new</code> Title: Colossal OSCAR 1 [new; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2016-40","title":"Colossal OSCAR 1 [new; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_new</code> Title: Colossal OSCAR 1 [new; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2017-43","title":"Colossal OSCAR 1 [new; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_new</code> Title: Colossal OSCAR 1 [new; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2018-47","title":"Colossal OSCAR 1 [new; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_new</code> Title: Colossal OSCAR 1 [new; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2019-22","title":"Colossal OSCAR 1 [new; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_new</code> Title: Colossal OSCAR 1 [new; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2020-24","title":"Colossal OSCAR 1 [new; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_new</code> Title: Colossal OSCAR 1 [new; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2020-45","title":"Colossal OSCAR 1 [new; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_new</code> Title: Colossal OSCAR 1 [new; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2021-49","title":"Colossal OSCAR 1 [new; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_new</code> Title: Colossal OSCAR 1 [new; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2022-27","title":"Colossal OSCAR 1 [new; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_new</code> Title: Colossal OSCAR 1 [new; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2022-49","title":"Colossal OSCAR 1 [new; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_new</code> Title: Colossal OSCAR 1 [new; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2023-14","title":"Colossal OSCAR 1 [new; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_new</code> Title: Colossal OSCAR 1 [new; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_new/#colossal-oscar-1-new-2023-23","title":"Colossal OSCAR 1 [new; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_new</code> Title: Colossal OSCAR 1 [new; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_nl/","title":"Dutch Datasets","text":"<p>There are in total 26 datasets with 26 B tokens in Dutch language.</p>"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2015-14","title":"Colossal OSCAR 1 [nl; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_nl</code> Title: Colossal OSCAR 1 [nl; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2016-40","title":"Colossal OSCAR 1 [nl; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_nl</code> Title: Colossal OSCAR 1 [nl; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2017-43","title":"Colossal OSCAR 1 [nl; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_nl</code> Title: Colossal OSCAR 1 [nl; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2018-47","title":"Colossal OSCAR 1 [nl; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_nl</code> Title: Colossal OSCAR 1 [nl; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2019-22","title":"Colossal OSCAR 1 [nl; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_nl</code> Title: Colossal OSCAR 1 [nl; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2020-24","title":"Colossal OSCAR 1 [nl; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_nl</code> Title: Colossal OSCAR 1 [nl; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2020-45","title":"Colossal OSCAR 1 [nl; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_nl</code> Title: Colossal OSCAR 1 [nl; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2021-49","title":"Colossal OSCAR 1 [nl; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_nl</code> Title: Colossal OSCAR 1 [nl; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2022-27","title":"Colossal OSCAR 1 [nl; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_nl</code> Title: Colossal OSCAR 1 [nl; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2022-49","title":"Colossal OSCAR 1 [nl; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_nl</code> Title: Colossal OSCAR 1 [nl; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2023-14","title":"Colossal OSCAR 1 [nl; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_nl</code> Title: Colossal OSCAR 1 [nl; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#colossal-oscar-1-nl-2023-23","title":"Colossal OSCAR 1 [nl; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_nl</code> Title: Colossal OSCAR 1 [nl; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 17 B"},{"location":"datasets/language_nl/#eurlexresources-nl","title":"EurlexResources [nl]","text":"Dataset ID: <code>eurlex_nl</code> Title: EurlexResources [nl] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 8 B"},{"location":"datasets/language_nl/#legalmc4-nl","title":"LegalMC4 [nl]","text":"Dataset ID: <code>legal_mc4_nl</code> Title: LegalMC4 [nl] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 22 M"},{"location":"datasets/language_nl/#sonar-corpus-nc-12","title":"SoNaR Corpus NC 1.2","text":"Dataset ID: <code>sonar_subtitles</code> Title: SoNaR Corpus NC 1.2 Description: The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place. 
Availibility: <code>signin_download</code> Homepage: [https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/] License: unknown, likely research only or fair use (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#sonar-corpus-nc-12_1","title":"SoNaR Corpus NC 1.2","text":"Dataset ID: <code>sonar_edu</code> Title: SoNaR Corpus NC 1.2 Description: The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place. Availibility: <code>signin_download</code> Homepage: [https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/] License: unknown, likely research only or fair use (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#sonar-corpus-nc-12_2","title":"SoNaR Corpus NC 1.2","text":"Dataset ID: <code>sonar_news</code> Title: SoNaR Corpus NC 1.2 Description: The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place. Availibility: <code>signin_download</code> Homepage: [https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/] License: unknown, likely research only or fair use (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#sonar-corpus-nc-12_3","title":"SoNaR Corpus NC 1.2","text":"Dataset ID: <code>sonar_books</code> Title: SoNaR Corpus NC 1.2 Description: The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. 
All annotations were produced automatically, no manual verification took place. Availibility: <code>signin_download</code> Homepage: [https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/] License: unknown, likely research only or fair use (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#sonar-corpus-nc-12_4","title":"SoNaR Corpus NC 1.2","text":"Dataset ID: <code>sonar_gov</code> Title: SoNaR Corpus NC 1.2 Description: The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place. Availibility: <code>signin_download</code> Homepage: [https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/] License: unknown, likely research only or fair use (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nl/#sonar-corpus-nc-12_5","title":"SoNaR Corpus NC 1.2","text":"Dataset ID: <code>sonar_web</code> Title: SoNaR Corpus NC 1.2 Description: The SoNaR Corpus contains more than 500 million words from texts in standard Dutch later than 1954. All texts were tokenized, tagged for part of speech and lemmatized. The named entities were also labelled. All annotations were produced automatically, no manual verification took place. Availibility: <code>signin_download</code> Homepage: [https://taalmaterialen.ivdnt.org/download/tstc-sonar-corpus/] License: unknown, likely research only or fair use (commercial use: None, sharealike: None) Tokens: 500 M"},{"location":"datasets/language_nl/#wikibooks-nl","title":"Wikibooks [nl]","text":"Dataset ID: <code>wikibooks_nl</code> Title: Wikibooks [nl] Description: The open-content textbooks collection that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 8 M"},{"location":"datasets/language_nl/#wikinews-nl","title":"Wikinews [nl]","text":"Dataset ID: <code>wikinews_nl</code> Title: Wikinews [nl] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M"},{"location":"datasets/language_nl/#wikipedia-nl","title":"Wikipedia [nl]","text":"Dataset ID: <code>wiki_nl</code> Title: Wikipedia [nl] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 263 M"},{"location":"datasets/language_nl/#wikiquote-nl","title":"Wikiquote [nl]","text":"Dataset ID: <code>wikiquote_nl</code> Title: Wikiquote [nl] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 75 k"},{"location":"datasets/language_nl/#wikisource-nl","title":"Wikisource [nl]","text":"Dataset ID: <code>wikisource_nl</code> Title: Wikisource [nl] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 16 M"},{"location":"datasets/language_nl/#wikivoyage-nl","title":"Wikivoyage [nl]","text":"Dataset ID: <code>wikivoyage_nl</code> Title: Wikivoyage [nl] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_nn/","title":"Norwegian Nynorsk Datasets","text":"<p>There are in total 15 datasets with 301 M tokens in Norwegian Nynorsk language.</p>"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2015-14","title":"Colossal OSCAR 1 [nn; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_nn</code> Title: Colossal OSCAR 1 [nn; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2016-40","title":"Colossal OSCAR 1 [nn; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_nn</code> Title: Colossal OSCAR 1 [nn; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2017-43","title":"Colossal OSCAR 1 [nn; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_nn</code> Title: Colossal OSCAR 1 [nn; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2018-47","title":"Colossal OSCAR 1 [nn; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_nn</code> Title: Colossal OSCAR 1 [nn; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2019-22","title":"Colossal OSCAR 1 [nn; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_nn</code> Title: Colossal OSCAR 1 [nn; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2020-24","title":"Colossal OSCAR 1 [nn; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_nn</code> Title: Colossal OSCAR 1 [nn; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2020-45","title":"Colossal OSCAR 1 [nn; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_nn</code> Title: Colossal OSCAR 1 [nn; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2021-49","title":"Colossal OSCAR 1 [nn; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_nn</code> Title: Colossal OSCAR 1 [nn; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2022-27","title":"Colossal OSCAR 1 [nn; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_nn</code> Title: Colossal OSCAR 1 [nn; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2022-49","title":"Colossal OSCAR 1 [nn; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_nn</code> Title: Colossal OSCAR 1 [nn; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2023-14","title":"Colossal OSCAR 1 [nn; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_nn</code> Title: Colossal OSCAR 1 [nn; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#colossal-oscar-1-nn-2023-23","title":"Colossal OSCAR 1 [nn; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_nn</code> Title: Colossal OSCAR 1 [nn; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 857 k"},{"location":"datasets/language_nn/#norwegian-colossal-corpus","title":"Norwegian Colossal Corpus","text":"Dataset ID: <code>norwegian_cc_nn</code> Title: Norwegian Colossal Corpus Description: The Norwegian Colossal Corpus is a collection of multiple smaller Norwegian corpuses suitable for training large language models. We have done extensive cleaning on the datasets, and have made them available in a common format. The total size of the NCC is currently 45GB. 
Documents: 20,830,348; Words/document: 331 Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/NbAiLab/NCC] License: mixed (NLOD 2.0, CC0 1.0, CC BY-NC 2.0, CC BY-SA 3.0) (commercial use: False, sharealike: None) Tokens: 300 M"},{"location":"datasets/language_nn/#wikipedia-nn","title":"Wikipedia [nn]","text":"Dataset ID: <code>wiki_nn</code> Title: Wikipedia [nn] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_nn/#wikiquote-nn","title":"Wikiquote [nn]","text":"Dataset ID: <code>wikiquote_nn</code> Title: Wikiquote [nn] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_no/","title":"Norwegian Datasets","text":"<p>There are in total 19 datasets with 5 B tokens in Norwegian language.</p>"},{"location":"datasets/language_no/#colossal-oscar-1-no-2015-14","title":"Colossal OSCAR 1 [no; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_no</code> Title: Colossal OSCAR 1 [no; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2016-40","title":"Colossal OSCAR 1 [no; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_no</code> Title: Colossal OSCAR 1 [no; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2017-43","title":"Colossal OSCAR 1 [no; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_no</code> Title: Colossal OSCAR 1 [no; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2018-47","title":"Colossal OSCAR 1 [no; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_no</code> Title: Colossal OSCAR 1 [no; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2019-22","title":"Colossal OSCAR 1 [no; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_no</code> Title: Colossal OSCAR 1 [no; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2020-24","title":"Colossal OSCAR 1 [no; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_no</code> Title: Colossal OSCAR 1 [no; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2020-45","title":"Colossal OSCAR 1 [no; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_no</code> Title: Colossal OSCAR 1 [no; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2021-49","title":"Colossal OSCAR 1 [no; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_no</code> Title: Colossal OSCAR 1 [no; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2022-27","title":"Colossal OSCAR 1 [no; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_no</code> Title: Colossal OSCAR 1 [no; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2022-49","title":"Colossal OSCAR 1 [no; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_no</code> Title: Colossal OSCAR 1 [no; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2023-14","title":"Colossal OSCAR 1 [no; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_no</code> Title: Colossal OSCAR 1 [no; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#colossal-oscar-1-no-2023-23","title":"Colossal OSCAR 1 [no; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_no</code> Title: Colossal OSCAR 1 [no; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 345 M"},{"location":"datasets/language_no/#norwegian-colossal-corpus","title":"Norwegian Colossal Corpus","text":"Dataset ID: <code>norwegian_cc_no</code> Title: Norwegian Colossal Corpus Description: The Norwegian Colossal Corpus is a collection of multiple smaller Norwegian corpuses suitable for training large language models. We have done extensive cleaning on the datasets, and have made them available in a common format. The total size of the NCC is currently 45GB. 
Documents: 20,830,348; Words/document: 331 Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/NbAiLab/NCC] License: mixed (NLOD 2.0, CC0 1.0, CC BY-NC 2.0, CC BY-SA 3.0) (commercial use: False, sharealike: None) Tokens: 5 B"},{"location":"datasets/language_no/#wikibooks-no","title":"Wikibooks [no]","text":"Dataset ID: <code>wikibooks_no</code> Title: Wikibooks [no] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#wikinews-no","title":"Wikinews [no]","text":"Dataset ID: <code>wikinews_no</code> Title: Wikinews [no] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#wikipedia-no","title":"Wikipedia [no]","text":"Dataset ID: <code>wiki_no</code> Title: Wikipedia [no] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#wikiquote-no","title":"Wikiquote [no]","text":"Dataset ID: <code>wikiquote_no</code> Title: Wikiquote [no] Description: The free quote compendium that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#wikisource-no","title":"Wikisource [no]","text":"Dataset ID: <code>wikisource_no</code> Title: Wikisource [no] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_no/#wikivoyage-no","title":"Wikivoyage [no]","text":"Dataset ID: <code>wikivoyage_no</code> Title: Wikivoyage [no] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ny/","title":"Chichewa Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Chichewa language.</p>"},{"location":"datasets/language_ny/#wura-chichewa","title":"WURA [Chichewa]","text":"Dataset ID: <code>wura_ny</code> Title: WURA [Chichewa] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_oc/","title":"Occitan Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Occitan language.</p>"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2015-14","title":"Colossal OSCAR 1 [oc; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_oc</code> Title: Colossal OSCAR 1 [oc; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2016-40","title":"Colossal OSCAR 1 [oc; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_oc</code> Title: Colossal OSCAR 1 [oc; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2017-43","title":"Colossal OSCAR 1 [oc; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_oc</code> Title: Colossal OSCAR 1 [oc; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2018-47","title":"Colossal OSCAR 1 [oc; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_oc</code> Title: Colossal OSCAR 1 [oc; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2019-22","title":"Colossal OSCAR 1 [oc; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_oc</code> Title: Colossal OSCAR 1 [oc; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2020-24","title":"Colossal OSCAR 1 [oc; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_oc</code> Title: Colossal OSCAR 1 [oc; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2020-45","title":"Colossal OSCAR 1 [oc; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_oc</code> Title: Colossal OSCAR 1 [oc; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2021-49","title":"Colossal OSCAR 1 [oc; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_oc</code> Title: Colossal OSCAR 1 [oc; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2022-27","title":"Colossal OSCAR 1 [oc; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_oc</code> Title: Colossal OSCAR 1 [oc; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2022-49","title":"Colossal OSCAR 1 [oc; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_oc</code> Title: Colossal OSCAR 1 [oc; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2023-14","title":"Colossal OSCAR 1 [oc; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_oc</code> Title: Colossal OSCAR 1 [oc; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_oc/#colossal-oscar-1-oc-2023-23","title":"Colossal OSCAR 1 [oc; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_oc</code> Title: Colossal OSCAR 1 [oc; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_om/","title":"Oromo Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Oromo language.</p>"},{"location":"datasets/language_om/#wura-oromo","title":"WURA [Oromo]","text":"Dataset ID: <code>wura_om</code> Title: WURA [Oromo] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_or/","title":"Oriya Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Oriya language.</p>"},{"location":"datasets/language_or/#colossal-oscar-1-or-2015-14","title":"Colossal OSCAR 1 [or; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_or</code> Title: Colossal OSCAR 1 [or; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2016-40","title":"Colossal OSCAR 1 [or; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_or</code> Title: Colossal OSCAR 1 [or; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2017-43","title":"Colossal OSCAR 1 [or; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_or</code> Title: Colossal OSCAR 1 [or; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2018-47","title":"Colossal OSCAR 1 [or; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_or</code> Title: Colossal OSCAR 1 [or; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2019-22","title":"Colossal OSCAR 1 [or; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_or</code> Title: Colossal OSCAR 1 [or; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2020-24","title":"Colossal OSCAR 1 [or; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_or</code> Title: Colossal OSCAR 1 [or; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2020-45","title":"Colossal OSCAR 1 [or; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_or</code> Title: Colossal OSCAR 1 [or; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2021-49","title":"Colossal OSCAR 1 [or; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_or</code> Title: Colossal OSCAR 1 [or; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2022-27","title":"Colossal OSCAR 1 [or; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_or</code> Title: Colossal OSCAR 1 [or; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2022-49","title":"Colossal OSCAR 1 [or; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_or</code> Title: Colossal OSCAR 1 [or; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2023-14","title":"Colossal OSCAR 1 [or; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_or</code> Title: Colossal OSCAR 1 [or; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_or/#colossal-oscar-1-or-2023-23","title":"Colossal OSCAR 1 [or; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_or</code> Title: Colossal OSCAR 1 [or; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_os/","title":"Ossetian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Ossetian language.</p>"},{"location":"datasets/language_os/#colossal-oscar-1-os-2015-14","title":"Colossal OSCAR 1 [os; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_os</code> Title: Colossal OSCAR 1 [os; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2016-40","title":"Colossal OSCAR 1 [os; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_os</code> Title: Colossal OSCAR 1 [os; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2017-43","title":"Colossal OSCAR 1 [os; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_os</code> Title: Colossal OSCAR 1 [os; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2018-47","title":"Colossal OSCAR 1 [os; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_os</code> Title: Colossal OSCAR 1 [os; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2019-22","title":"Colossal OSCAR 1 [os; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_os</code> Title: Colossal OSCAR 1 [os; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2020-24","title":"Colossal OSCAR 1 [os; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_os</code> Title: Colossal OSCAR 1 [os; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2020-45","title":"Colossal OSCAR 1 [os; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_os</code> Title: Colossal OSCAR 1 [os; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2021-49","title":"Colossal OSCAR 1 [os; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_os</code> Title: Colossal OSCAR 1 [os; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2022-27","title":"Colossal OSCAR 1 [os; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_os</code> Title: Colossal OSCAR 1 [os; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2022-49","title":"Colossal OSCAR 1 [os; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_os</code> Title: Colossal OSCAR 1 [os; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2023-14","title":"Colossal OSCAR 1 [os; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_os</code> Title: Colossal OSCAR 1 [os; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_os/#colossal-oscar-1-os-2023-23","title":"Colossal OSCAR 1 [os; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_os</code> Title: Colossal OSCAR 1 [os; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_pa/","title":"Panjabi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Panjabi language.</p>"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2015-14","title":"Colossal OSCAR 1 [pa; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_pa</code> Title: Colossal OSCAR 1 [pa; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2016-40","title":"Colossal OSCAR 1 [pa; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_pa</code> Title: Colossal OSCAR 1 [pa; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2017-43","title":"Colossal OSCAR 1 [pa; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_pa</code> Title: Colossal OSCAR 1 [pa; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2018-47","title":"Colossal OSCAR 1 [pa; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_pa</code> Title: Colossal OSCAR 1 [pa; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2019-22","title":"Colossal OSCAR 1 [pa; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_pa</code> Title: Colossal OSCAR 1 [pa; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2020-24","title":"Colossal OSCAR 1 [pa; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_pa</code> Title: Colossal OSCAR 1 [pa; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2020-45","title":"Colossal OSCAR 1 [pa; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_pa</code> Title: Colossal OSCAR 1 [pa; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2021-49","title":"Colossal OSCAR 1 [pa; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_pa</code> Title: Colossal OSCAR 1 [pa; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2022-27","title":"Colossal OSCAR 1 [pa; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_pa</code> Title: Colossal OSCAR 1 [pa; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2022-49","title":"Colossal OSCAR 1 [pa; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_pa</code> Title: Colossal OSCAR 1 [pa; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2023-14","title":"Colossal OSCAR 1 [pa; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_pa</code> Title: Colossal OSCAR 1 [pa; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pa/#colossal-oscar-1-pa-2023-23","title":"Colossal OSCAR 1 [pa; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_pa</code> Title: Colossal OSCAR 1 [pa; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_pl/","title":"Polish Datasets","text":"<p>There are in total 23 datasets with 25 B tokens in Polish language.</p>"},{"location":"datasets/language_pl/#curlicat-corpus-polish","title":"CURLICAT Corpus [Polish]","text":"Dataset ID: <code>curlicat_pl</code> Title: CURLICAT Corpus [Polish] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. 
Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-polish-corpus/f63ae912553911ed9c1a00155d02670648c0a234e0314895b52169af2af57dd7/] License: CC-BY-SA-4.0 (commercial use: None, sharealike: True) Tokens: 59 M"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2015-14","title":"Colossal OSCAR 1 [pl; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_pl</code> Title: Colossal OSCAR 1 [pl; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2016-40","title":"Colossal OSCAR 1 [pl; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_pl</code> Title: Colossal OSCAR 1 [pl; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2017-43","title":"Colossal OSCAR 1 [pl; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_pl</code> Title: Colossal OSCAR 1 [pl; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2018-47","title":"Colossal OSCAR 1 [pl; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_pl</code> Title: Colossal OSCAR 1 [pl; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2019-22","title":"Colossal OSCAR 1 [pl; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_pl</code> Title: Colossal OSCAR 1 [pl; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2020-24","title":"Colossal OSCAR 1 [pl; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_pl</code> Title: Colossal OSCAR 1 [pl; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2020-45","title":"Colossal OSCAR 1 [pl; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_pl</code> Title: Colossal OSCAR 1 [pl; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2021-49","title":"Colossal OSCAR 1 [pl; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_pl</code> Title: Colossal OSCAR 1 [pl; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2022-27","title":"Colossal OSCAR 1 [pl; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_pl</code> Title: Colossal OSCAR 1 [pl; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2022-49","title":"Colossal OSCAR 1 [pl; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_pl</code> Title: Colossal OSCAR 1 [pl; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2023-14","title":"Colossal OSCAR 1 [pl; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_pl</code> Title: Colossal OSCAR 1 [pl; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pl/#colossal-oscar-1-pl-2023-23","title":"Colossal OSCAR 1 [pl; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_pl</code> Title: Colossal OSCAR 1 [pl; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 18 B"},{"location":"datasets/language_pl/#eurlexresources-pl","title":"EurlexResources [pl]","text":"Dataset ID: <code>eurlex_pl</code> Title: EurlexResources [pl] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_pl/#legalmc4-pl","title":"LegalMC4 [pl]","text":"Dataset ID: <code>legal_mc4_pl</code> Title: LegalMC4 [pl] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_pl/#nkjp-podkorpusmilionowy-12-national-corpus-of-polish","title":"NKJP-PodkorpusMilionowy-1.2 (National Corpus of Polish)","text":"Dataset ID: <code>pl_nkjp</code> Title: NKJP-PodkorpusMilionowy-1.2 (National Corpus of Polish) Description: A reference corpus of Polish language containing over fifteen hundred millions of words. The list of sources for the corpora contains classic literature, daily newspapers, specialist periodicals and journals, transcripts of conversations, and a variety of short-lived and internet texts. 
Availibility: <code>direct_download</code> Homepage: [http://clip.ipipan.waw.pl/NationalCorpusOfPolish] License: CC-BY (commercial use: None, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_pl/#polish-parliamentary-corpus-korpus-dyskursu-parlamentarnego","title":"Polish Parliamentary Corpus / Korpus Dyskursu Parlamentarnego","text":"Dataset ID: <code>pl_parliamentary_corpus</code> Title: Polish Parliamentary Corpus / Korpus Dyskursu Parlamentarnego Description: The Polish Parliamentary Corpus (PPC) is a Polish corpus made up of documents from the proceedings of the Polish Parliament, Sejm, and Senate. The corpus includes data of the Polish Sejm corpus and consists of stenographic records of plenary sittings and committee sittings, segments of interpellations and questions. Texts in the PPC corpus cover the period of a hundred years from 1919 to 2019. Availibility: <code>direct_download</code> Homepage: [http://clip.ipipan.waw.pl/PPC] License: CC-BY (commercial use: None, sharealike: None) Tokens: 671 M"},{"location":"datasets/language_pl/#wikibooks-pl","title":"Wikibooks [pl]","text":"Dataset ID: <code>wikibooks_pl</code> Title: Wikibooks [pl] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 10 M"},{"location":"datasets/language_pl/#wikinews-pl","title":"Wikinews [pl]","text":"Dataset ID: <code>wikinews_pl</code> Title: Wikinews [pl] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 9 M"},{"location":"datasets/language_pl/#wikipedia-pl","title":"Wikipedia [pl]","text":"Dataset ID: <code>wiki_pl</code> Title: Wikipedia [pl] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 361 M"},{"location":"datasets/language_pl/#wikiquote-pl","title":"Wikiquote [pl]","text":"Dataset ID: <code>wikiquote_pl</code> Title: Wikiquote [pl] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 29 M"},{"location":"datasets/language_pl/#wikisource-pl","title":"Wikisource [pl]","text":"Dataset ID: <code>wikisource_pl</code> Title: Wikisource [pl] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 19 M"},{"location":"datasets/language_pl/#wikivoyage-pl","title":"Wikivoyage [pl]","text":"Dataset ID: <code>wikivoyage_pl</code> Title: Wikivoyage [pl] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 9 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_pms/","title":"Pms Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Pms language.</p>"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2015-14","title":"Colossal OSCAR 1 [pms; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_pms</code> Title: Colossal OSCAR 1 [pms; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2016-40","title":"Colossal OSCAR 1 [pms; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_pms</code> Title: Colossal OSCAR 1 [pms; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2017-43","title":"Colossal OSCAR 1 [pms; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_pms</code> Title: Colossal OSCAR 1 [pms; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2018-47","title":"Colossal OSCAR 1 [pms; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_pms</code> Title: Colossal OSCAR 1 [pms; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2019-22","title":"Colossal OSCAR 1 [pms; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_pms</code> Title: Colossal OSCAR 1 [pms; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2020-24","title":"Colossal OSCAR 1 [pms; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_pms</code> Title: Colossal OSCAR 1 [pms; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2020-45","title":"Colossal OSCAR 1 [pms; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_pms</code> Title: Colossal OSCAR 1 [pms; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2021-49","title":"Colossal OSCAR 1 [pms; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_pms</code> Title: Colossal OSCAR 1 [pms; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2022-27","title":"Colossal OSCAR 1 [pms; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_pms</code> Title: Colossal OSCAR 1 [pms; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2022-49","title":"Colossal OSCAR 1 [pms; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_pms</code> Title: Colossal OSCAR 1 [pms; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2023-14","title":"Colossal OSCAR 1 [pms; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_pms</code> Title: Colossal OSCAR 1 [pms; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pms/#colossal-oscar-1-pms-2023-23","title":"Colossal OSCAR 1 [pms; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_pms</code> Title: Colossal OSCAR 1 [pms; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_pnb/","title":"Pnb Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Pnb language.</p>"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2015-14","title":"Colossal OSCAR 1 [pnb; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_pnb</code> Title: Colossal OSCAR 1 [pnb; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2016-40","title":"Colossal OSCAR 1 [pnb; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_pnb</code> Title: Colossal OSCAR 1 [pnb; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2017-43","title":"Colossal OSCAR 1 [pnb; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_pnb</code> Title: Colossal OSCAR 1 [pnb; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2018-47","title":"Colossal OSCAR 1 [pnb; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_pnb</code> Title: Colossal OSCAR 1 [pnb; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2019-22","title":"Colossal OSCAR 1 [pnb; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_pnb</code> Title: Colossal OSCAR 1 [pnb; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2020-24","title":"Colossal OSCAR 1 [pnb; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_pnb</code> Title: Colossal OSCAR 1 [pnb; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2020-45","title":"Colossal OSCAR 1 [pnb; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_pnb</code> Title: Colossal OSCAR 1 [pnb; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2021-49","title":"Colossal OSCAR 1 [pnb; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_pnb</code> Title: Colossal OSCAR 1 [pnb; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2022-27","title":"Colossal OSCAR 1 [pnb; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_pnb</code> Title: Colossal OSCAR 1 [pnb; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2022-49","title":"Colossal OSCAR 1 [pnb; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_pnb</code> Title: Colossal OSCAR 1 [pnb; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2023-14","title":"Colossal OSCAR 1 [pnb; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_pnb</code> Title: Colossal OSCAR 1 [pnb; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pnb/#colossal-oscar-1-pnb-2023-23","title":"Colossal OSCAR 1 [pnb; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_pnb</code> Title: Colossal OSCAR 1 [pnb; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ps/","title":"Pashto Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Pashto language.</p>"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2015-14","title":"Colossal OSCAR 1 [ps; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ps</code> Title: Colossal OSCAR 1 [ps; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2016-40","title":"Colossal OSCAR 1 [ps; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ps</code> Title: Colossal OSCAR 1 [ps; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2017-43","title":"Colossal OSCAR 1 [ps; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ps</code> Title: Colossal OSCAR 1 [ps; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2018-47","title":"Colossal OSCAR 1 [ps; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ps</code> Title: Colossal OSCAR 1 [ps; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2019-22","title":"Colossal OSCAR 1 [ps; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ps</code> Title: Colossal OSCAR 1 [ps; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2020-24","title":"Colossal OSCAR 1 [ps; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ps</code> Title: Colossal OSCAR 1 [ps; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2020-45","title":"Colossal OSCAR 1 [ps; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ps</code> Title: Colossal OSCAR 1 [ps; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2021-49","title":"Colossal OSCAR 1 [ps; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ps</code> Title: Colossal OSCAR 1 [ps; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2022-27","title":"Colossal OSCAR 1 [ps; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ps</code> Title: Colossal OSCAR 1 [ps; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2022-49","title":"Colossal OSCAR 1 [ps; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ps</code> Title: Colossal OSCAR 1 [ps; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2023-14","title":"Colossal OSCAR 1 [ps; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ps</code> Title: Colossal OSCAR 1 [ps; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ps/#colossal-oscar-1-ps-2023-23","title":"Colossal OSCAR 1 [ps; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ps</code> Title: Colossal OSCAR 1 [ps; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_pt/","title":"Portuguese Datasets","text":"<p>There are in total 23 datasets with 24 B tokens in Portuguese language.</p>"},{"location":"datasets/language_pt/#brazilian-portuguese-web-as-corpus","title":"Brazilian Portuguese Web as Corpus","text":"Dataset ID: <code>brwac</code> Title: Brazilian Portuguese Web as Corpus Description: The BrWaC (Brazilian Portuguese Web as Corpus) is a large corpus constructed followingthe Wacky framework, which was made public for research purposes.The current corpus version, released in January 2017, is composed by 3.53 million documents,2.68 billion tokens and 5.79 million types. Please note that this resource is availablesolely for academic research purposes, and you agreed not to use it for any commercial applications. Availibility: <code>on_request</code> Homepage: [https://huggingface.co/datasets/brwac] License: research-only (commercial use: False, sharealike: None) Tokens: 3 B"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2015-14","title":"Colossal OSCAR 1 [pt; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_pt</code> Title: Colossal OSCAR 1 [pt; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2016-40","title":"Colossal OSCAR 1 [pt; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_pt</code> Title: Colossal OSCAR 1 [pt; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2017-43","title":"Colossal OSCAR 1 [pt; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_pt</code> Title: Colossal OSCAR 1 [pt; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2018-47","title":"Colossal OSCAR 1 [pt; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_pt</code> Title: Colossal OSCAR 1 [pt; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2019-22","title":"Colossal OSCAR 1 [pt; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_pt</code> Title: Colossal OSCAR 1 [pt; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2020-24","title":"Colossal OSCAR 1 [pt; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_pt</code> Title: Colossal OSCAR 1 [pt; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2020-45","title":"Colossal OSCAR 1 [pt; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_pt</code> Title: Colossal OSCAR 1 [pt; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2021-49","title":"Colossal OSCAR 1 [pt; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_pt</code> Title: Colossal OSCAR 1 [pt; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2022-27","title":"Colossal OSCAR 1 [pt; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_pt</code> Title: Colossal OSCAR 1 [pt; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2022-49","title":"Colossal OSCAR 1 [pt; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_pt</code> Title: Colossal OSCAR 1 [pt; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2023-14","title":"Colossal OSCAR 1 [pt; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_pt</code> Title: Colossal OSCAR 1 [pt; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#colossal-oscar-1-pt-2023-23","title":"Colossal OSCAR 1 [pt; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_pt</code> Title: Colossal OSCAR 1 [pt; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 12 B"},{"location":"datasets/language_pt/#eurlexresources-pt","title":"EurlexResources [pt]","text":"Dataset ID: <code>eurlex_pt</code> Title: EurlexResources [pt] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 7 B"},{"location":"datasets/language_pt/#legalmc4-pt","title":"LegalMC4 [pt]","text":"Dataset ID: <code>legal_mc4_pt</code> Title: LegalMC4 [pt] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. 
By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 1 B"},{"location":"datasets/language_pt/#parlamentopt","title":"ParlamentoPT","text":"Dataset ID: <code>parlamento_pt</code> Title: ParlamentoPT Description: The ParlamentoPT is a Portuguese language data set obtained by collecting publicly available documents containing transcriptions of debates in the Portuguese Parliament. The data was collected from the Portuguese Parliament portal in accordance with its open data policy. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/PORTULAN/parlamento-pt] License: open data (Portuguese Parliament portal policy) (commercial use: None, sharealike: None) Tokens: 819 M"},{"location":"datasets/language_pt/#wura-portuguese","title":"WURA [Portuguese]","text":"Dataset ID: <code>wura_pt</code> Title: WURA [Portuguese] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_pt/#wikibooks-pt","title":"Wikibooks [pt]","text":"Dataset ID: <code>wikibooks_pt</code> Title: Wikibooks [pt] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 13 M"},{"location":"datasets/language_pt/#wikinews-pt","title":"Wikinews [pt]","text":"Dataset ID: <code>wikinews_pt</code> Title: Wikinews [pt] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 16 M"},{"location":"datasets/language_pt/#wikipedia-pt","title":"Wikipedia [pt]","text":"Dataset ID: <code>wiki_pt</code> Title: Wikipedia [pt] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 466 M"},{"location":"datasets/language_pt/#wikiquote-pt","title":"Wikiquote [pt]","text":"Dataset ID: <code>wikiquote_pt</code> Title: Wikiquote [pt] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 7 M"},{"location":"datasets/language_pt/#wikisource-pt","title":"Wikisource [pt]","text":"Dataset ID: <code>wikisource_pt</code> Title: Wikisource [pt] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 35 M"},{"location":"datasets/language_pt/#wikivoyage-pt","title":"Wikivoyage [pt]","text":"Dataset ID: <code>wikivoyage_pt</code> Title: Wikivoyage [pt] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_qu/","title":"Quechua Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Quechua language.</p>"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2015-14","title":"Colossal OSCAR 1 [qu; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_qu</code> Title: Colossal OSCAR 1 [qu; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2016-40","title":"Colossal OSCAR 1 [qu; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_qu</code> Title: Colossal OSCAR 1 [qu; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2017-43","title":"Colossal OSCAR 1 [qu; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_qu</code> Title: Colossal OSCAR 1 [qu; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2018-47","title":"Colossal OSCAR 1 [qu; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_qu</code> Title: Colossal OSCAR 1 [qu; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2019-22","title":"Colossal OSCAR 1 [qu; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_qu</code> Title: Colossal OSCAR 1 [qu; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2020-24","title":"Colossal OSCAR 1 [qu; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_qu</code> Title: Colossal OSCAR 1 [qu; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2020-45","title":"Colossal OSCAR 1 [qu; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_qu</code> Title: Colossal OSCAR 1 [qu; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2021-49","title":"Colossal OSCAR 1 [qu; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_qu</code> Title: Colossal OSCAR 1 [qu; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2022-27","title":"Colossal OSCAR 1 [qu; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_qu</code> Title: Colossal OSCAR 1 [qu; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2022-49","title":"Colossal OSCAR 1 [qu; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_qu</code> Title: Colossal OSCAR 1 [qu; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2023-14","title":"Colossal OSCAR 1 [qu; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_qu</code> Title: Colossal OSCAR 1 [qu; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_qu/#colossal-oscar-1-qu-2023-23","title":"Colossal OSCAR 1 [qu; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_qu</code> Title: Colossal OSCAR 1 [qu; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ro/","title":"Romanian Datasets","text":"<p>There are in total 22 datasets with 9 B tokens in Romanian language.</p>"},{"location":"datasets/language_ro/#curlicat-corpus-romanian","title":"CURLICAT Corpus [Romanian]","text":"Dataset ID: <code>curlicat_ro</code> Title: CURLICAT Corpus [Romanian] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-romanian-corpus/8b6c8dca58ea11ed9c1a00155d026706fb03ef8b4c1847cfbe9cea869a82731e/] License: CC-BY-SA-4.0 (commercial use: None, sharealike: True) Tokens: 95 M"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2015-14","title":"Colossal OSCAR 1 [ro; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ro</code> Title: Colossal OSCAR 1 [ro; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2016-40","title":"Colossal OSCAR 1 [ro; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ro</code> Title: Colossal OSCAR 1 [ro; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2017-43","title":"Colossal OSCAR 1 [ro; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ro</code> Title: Colossal OSCAR 1 [ro; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2018-47","title":"Colossal OSCAR 1 [ro; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ro</code> Title: Colossal OSCAR 1 [ro; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2019-22","title":"Colossal OSCAR 1 [ro; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ro</code> Title: Colossal OSCAR 1 [ro; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2020-24","title":"Colossal OSCAR 1 [ro; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ro</code> Title: Colossal OSCAR 1 [ro; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2020-45","title":"Colossal OSCAR 1 [ro; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ro</code> Title: Colossal OSCAR 1 [ro; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2021-49","title":"Colossal OSCAR 1 [ro; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ro</code> Title: Colossal OSCAR 1 [ro; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2022-27","title":"Colossal OSCAR 1 [ro; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ro</code> Title: Colossal OSCAR 1 [ro; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2022-49","title":"Colossal OSCAR 1 [ro; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ro</code> Title: Colossal OSCAR 1 [ro; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2023-14","title":"Colossal OSCAR 1 [ro; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ro</code> Title: Colossal OSCAR 1 [ro; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ro/#colossal-oscar-1-ro-2023-23","title":"Colossal OSCAR 1 [ro; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ro</code> Title: Colossal OSCAR 1 [ro; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 4 B"},{"location":"datasets/language_ro/#eurlexresources-ro","title":"EurlexResources [ro]","text":"Dataset ID: <code>eurlex_ro</code> Title: EurlexResources [ro] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_ro/#legalmc4-ro","title":"LegalMC4 [ro]","text":"Dataset ID: <code>legal_mc4_ro</code> Title: LegalMC4 [ro] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. 
By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 551 M"},{"location":"datasets/language_ro/#marcell-romanian-legislative-subcorpus-v2","title":"MARCELL Romanian legislative subcorpus v2","text":"Dataset ID: <code>marcell_legislative_subcorpus_v2</code> Title: MARCELL Romanian legislative subcorpus v2 Description: The Romanian corpus contains 163,274 files, which represent the body of national legislation ranging from 1881 to 2021. This corpus includes mainly: governmental decisions, ministerial orders, decisions, decrees and laws. All the texts were obtained via crawling from the public Romanian legislative portal. This corpus resulted from the MARCELL project. Alternate download location: https://relate.racai.ro/marcell/new/ Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/marcell-romanian-legislative-subcorpus-v2/2da548428b9d11eb9c1a00155d026706ce94a6b59ffc4b0e9fb5cd9cebe6889e/] License: public domain Tokens: 31 M"},{"location":"datasets/language_ro/#wikibooks-ro","title":"Wikibooks [ro]","text":"Dataset ID: <code>wikibooks_ro</code> Title: Wikibooks [ro] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_ro/#wikinews-ro","title":"Wikinews [ro]","text":"Dataset ID: <code>wikinews_ro</code> Title: Wikinews [ro] Description: News written by volunteers. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 744 k"},{"location":"datasets/language_ro/#wikipedia-ro","title":"Wikipedia [ro]","text":"Dataset ID: <code>wiki_ro</code> Title: Wikipedia [ro] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 152 M"},{"location":"datasets/language_ro/#wikiquote-ro","title":"Wikiquote [ro]","text":"Dataset ID: <code>wikiquote_ro</code> Title: Wikiquote [ro] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 436 k"},{"location":"datasets/language_ro/#wikisource-ro","title":"Wikisource [ro]","text":"Dataset ID: <code>wikisource_ro</code> Title: Wikisource [ro] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 49 M"},{"location":"datasets/language_ro/#wikivoyage-ro","title":"Wikivoyage [ro]","text":"Dataset ID: <code>wikivoyage_ro</code> Title: Wikivoyage [ro] Description: The free worldwide travel guide that you can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 507 k <p>This page is automatically generated.</p>"},{"location":"datasets/language_ru/","title":"Russian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Russian language.</p>"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2015-14","title":"Colossal OSCAR 1 [ru; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ru</code> Title: Colossal OSCAR 1 [ru; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2016-40","title":"Colossal OSCAR 1 [ru; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ru</code> Title: Colossal OSCAR 1 [ru; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2017-43","title":"Colossal OSCAR 1 [ru; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ru</code> Title: Colossal OSCAR 1 [ru; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2018-47","title":"Colossal OSCAR 1 [ru; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ru</code> Title: Colossal OSCAR 1 [ru; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2019-22","title":"Colossal OSCAR 1 [ru; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ru</code> Title: Colossal OSCAR 1 [ru; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2020-24","title":"Colossal OSCAR 1 [ru; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ru</code> Title: Colossal OSCAR 1 [ru; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2020-45","title":"Colossal OSCAR 1 [ru; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ru</code> Title: Colossal OSCAR 1 [ru; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2021-49","title":"Colossal OSCAR 1 [ru; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ru</code> Title: Colossal OSCAR 1 [ru; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2022-27","title":"Colossal OSCAR 1 [ru; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ru</code> Title: Colossal OSCAR 1 [ru; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2022-49","title":"Colossal OSCAR 1 [ru; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ru</code> Title: Colossal OSCAR 1 [ru; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2023-14","title":"Colossal OSCAR 1 [ru; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ru</code> Title: Colossal OSCAR 1 [ru; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ru/#colossal-oscar-1-ru-2023-23","title":"Colossal OSCAR 1 [ru; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ru</code> Title: Colossal OSCAR 1 [ru; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_rw/","title":"Kinyarwanda Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Kinyarwanda language.</p>"},{"location":"datasets/language_rw/#wura-kinyarwanda","title":"WURA [Kinyarwanda]","text":"Dataset ID: <code>wura_rw</code> Title: WURA [Kinyarwanda] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sa/","title":"Sanskrit Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Sanskrit language.</p>"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2015-14","title":"Colossal OSCAR 1 [sa; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sa</code> Title: Colossal OSCAR 1 [sa; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2016-40","title":"Colossal OSCAR 1 [sa; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sa</code> Title: Colossal OSCAR 1 [sa; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2017-43","title":"Colossal OSCAR 1 [sa; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sa</code> Title: Colossal OSCAR 1 [sa; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2018-47","title":"Colossal OSCAR 1 [sa; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sa</code> Title: Colossal OSCAR 1 [sa; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2019-22","title":"Colossal OSCAR 1 [sa; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sa</code> Title: Colossal OSCAR 1 [sa; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2020-24","title":"Colossal OSCAR 1 [sa; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sa</code> Title: Colossal OSCAR 1 [sa; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2020-45","title":"Colossal OSCAR 1 [sa; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sa</code> Title: Colossal OSCAR 1 [sa; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2021-49","title":"Colossal OSCAR 1 [sa; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sa</code> Title: Colossal OSCAR 1 [sa; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2022-27","title":"Colossal OSCAR 1 [sa; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sa</code> Title: Colossal OSCAR 1 [sa; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2022-49","title":"Colossal OSCAR 1 [sa; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sa</code> Title: Colossal OSCAR 1 [sa; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2023-14","title":"Colossal OSCAR 1 [sa; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sa</code> Title: Colossal OSCAR 1 [sa; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sa/#colossal-oscar-1-sa-2023-23","title":"Colossal OSCAR 1 [sa; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sa</code> Title: Colossal OSCAR 1 [sa; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sah/","title":"Sah Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Sah language.</p>"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2015-14","title":"Colossal OSCAR 1 [sah; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sah</code> Title: Colossal OSCAR 1 [sah; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2016-40","title":"Colossal OSCAR 1 [sah; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sah</code> Title: Colossal OSCAR 1 [sah; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2017-43","title":"Colossal OSCAR 1 [sah; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sah</code> Title: Colossal OSCAR 1 [sah; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2018-47","title":"Colossal OSCAR 1 [sah; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sah</code> Title: Colossal OSCAR 1 [sah; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2019-22","title":"Colossal OSCAR 1 [sah; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sah</code> Title: Colossal OSCAR 1 [sah; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2020-24","title":"Colossal OSCAR 1 [sah; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sah</code> Title: Colossal OSCAR 1 [sah; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2020-45","title":"Colossal OSCAR 1 [sah; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sah</code> Title: Colossal OSCAR 1 [sah; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2021-49","title":"Colossal OSCAR 1 [sah; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sah</code> Title: Colossal OSCAR 1 [sah; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2022-27","title":"Colossal OSCAR 1 [sah; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sah</code> Title: Colossal OSCAR 1 [sah; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2022-49","title":"Colossal OSCAR 1 [sah; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sah</code> Title: Colossal OSCAR 1 [sah; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2023-14","title":"Colossal OSCAR 1 [sah; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sah</code> Title: Colossal OSCAR 1 [sah; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sah/#colossal-oscar-1-sah-2023-23","title":"Colossal OSCAR 1 [sah; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sah</code> Title: Colossal OSCAR 1 [sah; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sd/","title":"Sindhi Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Sindhi language.</p>"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2015-14","title":"Colossal OSCAR 1 [sd; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sd</code> Title: Colossal OSCAR 1 [sd; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2016-40","title":"Colossal OSCAR 1 [sd; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sd</code> Title: Colossal OSCAR 1 [sd; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2017-43","title":"Colossal OSCAR 1 [sd; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sd</code> Title: Colossal OSCAR 1 [sd; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2018-47","title":"Colossal OSCAR 1 [sd; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sd</code> Title: Colossal OSCAR 1 [sd; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2019-22","title":"Colossal OSCAR 1 [sd; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sd</code> Title: Colossal OSCAR 1 [sd; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2020-24","title":"Colossal OSCAR 1 [sd; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sd</code> Title: Colossal OSCAR 1 [sd; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2020-45","title":"Colossal OSCAR 1 [sd; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sd</code> Title: Colossal OSCAR 1 [sd; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2021-49","title":"Colossal OSCAR 1 [sd; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sd</code> Title: Colossal OSCAR 1 [sd; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2022-27","title":"Colossal OSCAR 1 [sd; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sd</code> Title: Colossal OSCAR 1 [sd; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2022-49","title":"Colossal OSCAR 1 [sd; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sd</code> Title: Colossal OSCAR 1 [sd; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2023-14","title":"Colossal OSCAR 1 [sd; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sd</code> Title: Colossal OSCAR 1 [sd; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sd/#colossal-oscar-1-sd-2023-23","title":"Colossal OSCAR 1 [sd; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sd</code> Title: Colossal OSCAR 1 [sd; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sh/","title":"Serbo-Croatian Datasets","text":"<p>There are in total 13 datasets with 58 k tokens in Serbo-Croatian language.</p>"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2015-14","title":"Colossal OSCAR 1 [sh; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sh</code> Title: Colossal OSCAR 1 [sh; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2016-40","title":"Colossal OSCAR 1 [sh; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sh</code> Title: Colossal OSCAR 1 [sh; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2017-43","title":"Colossal OSCAR 1 [sh; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sh</code> Title: Colossal OSCAR 1 [sh; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2018-47","title":"Colossal OSCAR 1 [sh; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sh</code> Title: Colossal OSCAR 1 [sh; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2019-22","title":"Colossal OSCAR 1 [sh; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sh</code> Title: Colossal OSCAR 1 [sh; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2020-24","title":"Colossal OSCAR 1 [sh; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sh</code> Title: Colossal OSCAR 1 [sh; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2020-45","title":"Colossal OSCAR 1 [sh; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sh</code> Title: Colossal OSCAR 1 [sh; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2021-49","title":"Colossal OSCAR 1 [sh; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sh</code> Title: Colossal OSCAR 1 [sh; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2022-27","title":"Colossal OSCAR 1 [sh; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sh</code> Title: Colossal OSCAR 1 [sh; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2022-49","title":"Colossal OSCAR 1 [sh; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sh</code> Title: Colossal OSCAR 1 [sh; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2023-14","title":"Colossal OSCAR 1 [sh; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sh</code> Title: Colossal OSCAR 1 [sh; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sh/#colossal-oscar-1-sh-2023-23","title":"Colossal OSCAR 1 [sh; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sh</code> Title: Colossal OSCAR 1 [sh; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 58 k"},{"location":"datasets/language_sh/#wikipedia-sh","title":"Wikipedia [sh]","text":"Dataset ID: <code>wiki_sh</code> Title: Wikipedia [sh] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_si/","title":"Sinhalese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Sinhalese language.</p>"},{"location":"datasets/language_si/#colossal-oscar-1-si-2015-14","title":"Colossal OSCAR 1 [si; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_si</code> Title: Colossal OSCAR 1 [si; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2016-40","title":"Colossal OSCAR 1 [si; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_si</code> Title: Colossal OSCAR 1 [si; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2017-43","title":"Colossal OSCAR 1 [si; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_si</code> Title: Colossal OSCAR 1 [si; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2018-47","title":"Colossal OSCAR 1 [si; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_si</code> Title: Colossal OSCAR 1 [si; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2019-22","title":"Colossal OSCAR 1 [si; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_si</code> Title: Colossal OSCAR 1 [si; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2020-24","title":"Colossal OSCAR 1 [si; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_si</code> Title: Colossal OSCAR 1 [si; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2020-45","title":"Colossal OSCAR 1 [si; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_si</code> Title: Colossal OSCAR 1 [si; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2021-49","title":"Colossal OSCAR 1 [si; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_si</code> Title: Colossal OSCAR 1 [si; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2022-27","title":"Colossal OSCAR 1 [si; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_si</code> Title: Colossal OSCAR 1 [si; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2022-49","title":"Colossal OSCAR 1 [si; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_si</code> Title: Colossal OSCAR 1 [si; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2023-14","title":"Colossal OSCAR 1 [si; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_si</code> Title: Colossal OSCAR 1 [si; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_si/#colossal-oscar-1-si-2023-23","title":"Colossal OSCAR 1 [si; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_si</code> Title: Colossal OSCAR 1 [si; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sk/","title":"Slovak Datasets","text":"<p>There are in total 21 datasets with 18 B tokens in Slovak language.</p>"},{"location":"datasets/language_sk/#curlicat-corpus-slovak-3rd-version","title":"CURLICAT Corpus [Slovak 3rd version]","text":"Dataset ID: <code>curlicat_sk</code> Title: CURLICAT Corpus [Slovak 3rd version] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-slovak-corpus-v10/b419d7086ef411ed9c1a00155d0267066a930aa487824c08ba48f1183e993aca/] License: unknown (commercial use: None, sharealike: None) Tokens: 67 M"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2015-14","title":"Colossal OSCAR 1 [sk; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sk</code> Title: Colossal OSCAR 1 [sk; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2016-40","title":"Colossal OSCAR 1 [sk; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sk</code> Title: Colossal OSCAR 1 [sk; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2017-43","title":"Colossal OSCAR 1 [sk; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sk</code> Title: Colossal OSCAR 1 [sk; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2018-47","title":"Colossal OSCAR 1 [sk; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sk</code> Title: Colossal OSCAR 1 [sk; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2019-22","title":"Colossal OSCAR 1 [sk; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sk</code> Title: Colossal OSCAR 1 [sk; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2020-24","title":"Colossal OSCAR 1 [sk; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sk</code> Title: Colossal OSCAR 1 [sk; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2020-45","title":"Colossal OSCAR 1 [sk; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sk</code> Title: Colossal OSCAR 1 [sk; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2021-49","title":"Colossal OSCAR 1 [sk; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sk</code> Title: Colossal OSCAR 1 [sk; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2022-27","title":"Colossal OSCAR 1 [sk; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sk</code> Title: Colossal OSCAR 1 [sk; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2022-49","title":"Colossal OSCAR 1 [sk; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sk</code> Title: Colossal OSCAR 1 [sk; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2023-14","title":"Colossal OSCAR 1 [sk; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sk</code> Title: Colossal OSCAR 1 [sk; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sk/#colossal-oscar-1-sk-2023-23","title":"Colossal OSCAR 1 [sk; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sk</code> Title: Colossal OSCAR 1 [sk; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 2 B"},{"location":"datasets/language_sk/#eurlexresources-sk","title":"EurlexResources [sk]","text":"Dataset ID: <code>eurlex_sk</code> Title: EurlexResources [sk] Description: A Corpus Covering the Largest EURLEX Resources. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_sk/#korpus-slovenskych-pravnych-predpisov-v19","title":"Korpus slovensk\u00fdch pr\u00e1vnych predpisov v1.9","text":"Dataset ID: <code>sk_laws</code> Title: Korpus slovensk\u00fdch pr\u00e1vnych predpisov v1.9 Description: Slovak body of laws (1955-2022) Availibility: <code>Yes - it has a direct download link or links</code> Homepage: [None] License: public domain (commercial use: None, sharealike: None) Tokens: 45 M"},{"location":"datasets/language_sk/#legalmc4-sk","title":"LegalMC4 [sk]","text":"Dataset ID: <code>legal_mc4_sk</code> Title: LegalMC4 [sk] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 349 M"},{"location":"datasets/language_sk/#wikibooks-sk","title":"Wikibooks [sk]","text":"Dataset ID: <code>wikibooks_sk</code> Title: Wikibooks [sk] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 2 M"},{"location":"datasets/language_sk/#wikipedia-sk","title":"Wikipedia [sk]","text":"Dataset ID: <code>wiki_sk</code> Title: Wikipedia [sk] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 64 M"},{"location":"datasets/language_sk/#wikiquote-sk","title":"Wikiquote [sk]","text":"Dataset ID: <code>wikiquote_sk</code> Title: Wikiquote [sk] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_sk/#wikisource-sk","title":"Wikisource [sk]","text":"Dataset ID: <code>wikisource_sk</code> Title: Wikisource [sk] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M"},{"location":"datasets/language_sk/#od-justice-20","title":"od-justice 2.0","text":"Dataset ID: <code>sk_court_decisions</code> Title: od-justice 2.0 Description: Slovak court decisions. The corpus is based on data made available by the Ministry of Justice of the Slovak Republic. 
Availibility: <code>direct_download</code> Homepage: [https://www.juls.savba.sk/justicecorp.html] License: open data (commercial use: None, sharealike: None) Tokens: 11 B <p>This page is automatically generated.</p>"},{"location":"datasets/language_sl/","title":"Slovene Datasets","text":"<p>There are in total 23 datasets with 9 B tokens in Slovene language.</p>"},{"location":"datasets/language_sl/#curlicat-corpus-slovenian","title":"CURLICAT Corpus [Slovenian]","text":"Dataset ID: <code>curlicat_sl</code> Title: CURLICAT Corpus [Slovenian] Description: The CURLICAT corpus includes 7 monolingual corpora (Bulgarian, Croatian, Hungarian, Polish, Romanian, Slovak and Slovenian) containing selected samples from respective national corpora. Availibility: <code>direct_download</code> Homepage: [https://elrc-share.eu/repository/browse/curlicat-slovenian-corpus/e549f298590711ed9c1a00155d026706db0d61d46f294d9a821307cf9c5df245/] License: CC-BY-SA-4.0 (commercial use: None, sharealike: True) Tokens: 43 M"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2015-14","title":"Colossal OSCAR 1 [sl; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sl</code> Title: Colossal OSCAR 1 [sl; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2016-40","title":"Colossal OSCAR 1 [sl; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sl</code> Title: Colossal OSCAR 1 [sl; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2017-43","title":"Colossal OSCAR 1 [sl; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sl</code> Title: Colossal OSCAR 1 [sl; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2018-47","title":"Colossal OSCAR 1 [sl; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sl</code> Title: Colossal OSCAR 1 [sl; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2019-22","title":"Colossal OSCAR 1 [sl; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sl</code> Title: Colossal OSCAR 1 [sl; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2020-24","title":"Colossal OSCAR 1 [sl; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sl</code> Title: Colossal OSCAR 1 [sl; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2020-45","title":"Colossal OSCAR 1 [sl; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sl</code> Title: Colossal OSCAR 1 [sl; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2021-49","title":"Colossal OSCAR 1 [sl; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sl</code> Title: Colossal OSCAR 1 [sl; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2022-27","title":"Colossal OSCAR 1 [sl; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sl</code> Title: Colossal OSCAR 1 [sl; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2022-49","title":"Colossal OSCAR 1 [sl; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sl</code> Title: Colossal OSCAR 1 [sl; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2023-14","title":"Colossal OSCAR 1 [sl; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sl</code> Title: Colossal OSCAR 1 [sl; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sl/#colossal-oscar-1-sl-2023-23","title":"Colossal OSCAR 1 [sl; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sl</code> Title: Colossal OSCAR 1 [sl; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 181 M"},{"location":"datasets/language_sl/#corpus-of-academic-slovene-kas-20","title":"Corpus of academic Slovene KAS 2.0","text":"Dataset ID: <code>academic_slovene_kas</code> Title: Corpus of academic Slovene KAS 2.0 Description: The KAS corpus of Slovene academic writing consists of almost 65,000 BSc/BA, 16,000 MSc/MA and 1,600 PhD theses (82 thousand texts, 5 million pages or 1,5 billion tokens) written 2000 - 2018 and gathered from the digital libraries of Slovene higher education institutions via the Slovene Open Science portal (http://openscience.si/). 
Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1448] License: CLARIN.SI Licence ACA ID-BY-NC-INF-NORED 1.0 (commercial use: False, sharealike: None) Tokens: 1 B"},{"location":"datasets/language_sl/#eurlexresources-sl","title":"EurlexResources [sl]","text":"Dataset ID: <code>eurlex_sl</code> Title: EurlexResources [sl] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 4 B"},{"location":"datasets/language_sl/#legalmc4-sl","title":"LegalMC4 [sl]","text":"Dataset ID: <code>legal_mc4_sl</code> Title: LegalMC4 [sl] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 107 M"},{"location":"datasets/language_sl/#macocu-web-corpus-slovene-20","title":"MaCoCu web corpus [Slovene 2.0]","text":"Dataset ID: <code>macocu_sl</code> Title: MaCoCu web corpus [Slovene 2.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. 
See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1795] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 2 B"},{"location":"datasets/language_sl/#wikibooks-sl","title":"Wikibooks [sl]","text":"Dataset ID: <code>wikibooks_sl</code> Title: Wikibooks [sl] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 2 M"},{"location":"datasets/language_sl/#wikipedia-sl","title":"Wikipedia [sl]","text":"Dataset ID: <code>wiki_sl</code> Title: Wikipedia [sl] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 77 M"},{"location":"datasets/language_sl/#wikiquote-sl","title":"Wikiquote [sl]","text":"Dataset ID: <code>wikiquote_sl</code> Title: Wikiquote [sl] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 669 k"},{"location":"datasets/language_sl/#wikisource-sl","title":"Wikisource [sl]","text":"Dataset ID: <code>wikisource_sl</code> Title: Wikisource [sl] Description: The free library that anyone can improve. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 118 M"},{"location":"datasets/language_sl/#written-corpus-ccgigafida-10","title":"Written corpus ccGigafida 1.0","text":"Dataset ID: <code>cc_gigafida</code> Title: Written corpus ccGigafida 1.0 Description: Corpus ccGigafida consists of paragraph samples from 31,722 documents, each containing information about the source (e.g. newspapers, magazines), year of publication, text type (fiction, newspaper), the title and author if they are known. Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1035] License: Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) (commercial use: False, sharealike: True) Tokens: 127 M"},{"location":"datasets/language_sl/#slwac-web-corpus","title":"slWaC web corpus","text":"Dataset ID: <code>slwac_web</code> Title: slWaC web corpus Description: slWaC is a web corpus collected from the .si top-level domain in 2011 and 2014. The corpus is tokenized and annotated with the lemma and the morphosyntax layer. Availibility: <code>None</code> Homepage: [http://nlp.ffzg.hr/resources/corpora/slwac/] License: open license Tokens: 1 B <p>This page is automatically generated.</p>"},{"location":"datasets/language_sn/","title":"Shona Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Shona language.</p>"},{"location":"datasets/language_sn/#wura-shona","title":"WURA [Shona]","text":"Dataset ID: <code>wura_sn</code> Title: WURA [Shona] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_so/","title":"Somali Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Somali language.</p>"},{"location":"datasets/language_so/#colossal-oscar-1-so-2015-14","title":"Colossal OSCAR 1 [so; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_so</code> Title: Colossal OSCAR 1 [so; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2016-40","title":"Colossal OSCAR 1 [so; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_so</code> Title: Colossal OSCAR 1 [so; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2017-43","title":"Colossal OSCAR 1 [so; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_so</code> Title: Colossal OSCAR 1 [so; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2018-47","title":"Colossal OSCAR 1 [so; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_so</code> Title: Colossal OSCAR 1 [so; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2019-22","title":"Colossal OSCAR 1 [so; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_so</code> Title: Colossal OSCAR 1 [so; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2020-24","title":"Colossal OSCAR 1 [so; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_so</code> Title: Colossal OSCAR 1 [so; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2020-45","title":"Colossal OSCAR 1 [so; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_so</code> Title: Colossal OSCAR 1 [so; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2021-49","title":"Colossal OSCAR 1 [so; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_so</code> Title: Colossal OSCAR 1 [so; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2022-27","title":"Colossal OSCAR 1 [so; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_so</code> Title: Colossal OSCAR 1 [so; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2022-49","title":"Colossal OSCAR 1 [so; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_so</code> Title: Colossal OSCAR 1 [so; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2023-14","title":"Colossal OSCAR 1 [so; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_so</code> Title: Colossal OSCAR 1 [so; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#colossal-oscar-1-so-2023-23","title":"Colossal OSCAR 1 [so; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_so</code> Title: Colossal OSCAR 1 [so; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_so/#wura-somali","title":"WURA [Somali]","text":"Dataset ID: <code>wura_so</code> Title: WURA [Somali] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sq/","title":"Albanian Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Albanian language.</p>"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2015-14","title":"Colossal OSCAR 1 [sq; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sq</code> Title: Colossal OSCAR 1 [sq; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2016-40","title":"Colossal OSCAR 1 [sq; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sq</code> Title: Colossal OSCAR 1 [sq; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2017-43","title":"Colossal OSCAR 1 [sq; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sq</code> Title: Colossal OSCAR 1 [sq; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2018-47","title":"Colossal OSCAR 1 [sq; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sq</code> Title: Colossal OSCAR 1 [sq; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2019-22","title":"Colossal OSCAR 1 [sq; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sq</code> Title: Colossal OSCAR 1 [sq; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2020-24","title":"Colossal OSCAR 1 [sq; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sq</code> Title: Colossal OSCAR 1 [sq; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2020-45","title":"Colossal OSCAR 1 [sq; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sq</code> Title: Colossal OSCAR 1 [sq; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2021-49","title":"Colossal OSCAR 1 [sq; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sq</code> Title: Colossal OSCAR 1 [sq; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2022-27","title":"Colossal OSCAR 1 [sq; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sq</code> Title: Colossal OSCAR 1 [sq; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2022-49","title":"Colossal OSCAR 1 [sq; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sq</code> Title: Colossal OSCAR 1 [sq; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2023-14","title":"Colossal OSCAR 1 [sq; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sq</code> Title: Colossal OSCAR 1 [sq; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sq/#colossal-oscar-1-sq-2023-23","title":"Colossal OSCAR 1 [sq; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sq</code> Title: Colossal OSCAR 1 [sq; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sr/","title":"Serbian Datasets","text":"<p>There are in total 19 datasets with 3 B tokens in Serbian language.</p>"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2015-14","title":"Colossal OSCAR 1 [sr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sr</code> Title: Colossal OSCAR 1 [sr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2016-40","title":"Colossal OSCAR 1 [sr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sr</code> Title: Colossal OSCAR 1 [sr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2017-43","title":"Colossal OSCAR 1 [sr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sr</code> Title: Colossal OSCAR 1 [sr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2018-47","title":"Colossal OSCAR 1 [sr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sr</code> Title: Colossal OSCAR 1 [sr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2019-22","title":"Colossal OSCAR 1 [sr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sr</code> Title: Colossal OSCAR 1 [sr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2020-24","title":"Colossal OSCAR 1 [sr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sr</code> Title: Colossal OSCAR 1 [sr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2020-45","title":"Colossal OSCAR 1 [sr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sr</code> Title: Colossal OSCAR 1 [sr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2021-49","title":"Colossal OSCAR 1 [sr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sr</code> Title: Colossal OSCAR 1 [sr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2022-27","title":"Colossal OSCAR 1 [sr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sr</code> Title: Colossal OSCAR 1 [sr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2022-49","title":"Colossal OSCAR 1 [sr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sr</code> Title: Colossal OSCAR 1 [sr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2023-14","title":"Colossal OSCAR 1 [sr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sr</code> Title: Colossal OSCAR 1 [sr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#colossal-oscar-1-sr-2023-23","title":"Colossal OSCAR 1 [sr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sr</code> Title: Colossal OSCAR 1 [sr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 652 M"},{"location":"datasets/language_sr/#macocu-web-corpus-serbian-10","title":"MaCoCu web corpus [Serbian 1.0]","text":"Dataset ID: <code>macocu_sr</code> Title: MaCoCu web corpus [Serbian 1.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. 
See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1807] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 2 B"},{"location":"datasets/language_sr/#srpkorsubset-news-legal-academic-conversation-literary","title":"SrpKorSubset (news, legal, academic, conversation, literary)","text":"Dataset ID: <code>srpkor</code> Title: SrpKorSubset (news, legal, academic, conversation, literary) Description: The Corpus of contemporary Serbian, SrpKor, consists of 4,925 texts. Availibility: <code>on_request</code> Homepage: [http://www.korpus.matf.bg.ac.rs/] License: Do not redistribute, DFKI has permission to use it for pre-training LLMs (commercial use: None, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#wikibooks-sr","title":"Wikibooks [sr]","text":"Dataset ID: <code>wikibooks_sr</code> Title: Wikibooks [sr] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#wikinews-sr","title":"Wikinews [sr]","text":"Dataset ID: <code>wikinews_sr</code> Title: Wikinews [sr] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#wikipedia-sr","title":"Wikipedia [sr]","text":"Dataset ID: <code>wiki_sr</code> Title: Wikipedia [sr] Description: The free encyclopedia that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#wikiquote-sr","title":"Wikiquote [sr]","text":"Dataset ID: <code>wikiquote_sr</code> Title: Wikiquote [sr] Description: The free quote compendium that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sr/#wikisource-sr","title":"Wikisource [sr]","text":"Dataset ID: <code>wikisource_sr</code> Title: Wikisource [sr] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_st/","title":"Southern Sotho Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Southern Sotho language.</p>"},{"location":"datasets/language_st/#wura-southern-sotho","title":"WURA [Southern Sotho]","text":"Dataset ID: <code>wura_st</code> Title: WURA [Southern Sotho] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_su/","title":"Sundanese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Sundanese language.</p>"},{"location":"datasets/language_su/#colossal-oscar-1-su-2015-14","title":"Colossal OSCAR 1 [su; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_su</code> Title: Colossal OSCAR 1 [su; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2016-40","title":"Colossal OSCAR 1 [su; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_su</code> Title: Colossal OSCAR 1 [su; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2017-43","title":"Colossal OSCAR 1 [su; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_su</code> Title: Colossal OSCAR 1 [su; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2018-47","title":"Colossal OSCAR 1 [su; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_su</code> Title: Colossal OSCAR 1 [su; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2019-22","title":"Colossal OSCAR 1 [su; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_su</code> Title: Colossal OSCAR 1 [su; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2020-24","title":"Colossal OSCAR 1 [su; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_su</code> Title: Colossal OSCAR 1 [su; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2020-45","title":"Colossal OSCAR 1 [su; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_su</code> Title: Colossal OSCAR 1 [su; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2021-49","title":"Colossal OSCAR 1 [su; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_su</code> Title: Colossal OSCAR 1 [su; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2022-27","title":"Colossal OSCAR 1 [su; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_su</code> Title: Colossal OSCAR 1 [su; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2022-49","title":"Colossal OSCAR 1 [su; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_su</code> Title: Colossal OSCAR 1 [su; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2023-14","title":"Colossal OSCAR 1 [su; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_su</code> Title: Colossal OSCAR 1 [su; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_su/#colossal-oscar-1-su-2023-23","title":"Colossal OSCAR 1 [su; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_su</code> Title: Colossal OSCAR 1 [su; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_sv/","title":"Swedish Datasets","text":"<p>There are in total 21 datasets with 13 B tokens in Swedish language.</p>"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2015-14","title":"Colossal OSCAR 1 [sv; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sv</code> Title: Colossal OSCAR 1 [sv; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2016-40","title":"Colossal OSCAR 1 [sv; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sv</code> Title: Colossal OSCAR 1 [sv; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2017-43","title":"Colossal OSCAR 1 [sv; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sv</code> Title: Colossal OSCAR 1 [sv; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2018-47","title":"Colossal OSCAR 1 [sv; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sv</code> Title: Colossal OSCAR 1 [sv; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2019-22","title":"Colossal OSCAR 1 [sv; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sv</code> Title: Colossal OSCAR 1 [sv; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2020-24","title":"Colossal OSCAR 1 [sv; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sv</code> Title: Colossal OSCAR 1 [sv; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2020-45","title":"Colossal OSCAR 1 [sv; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sv</code> Title: Colossal OSCAR 1 [sv; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2021-49","title":"Colossal OSCAR 1 [sv; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sv</code> Title: Colossal OSCAR 1 [sv; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2022-27","title":"Colossal OSCAR 1 [sv; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sv</code> Title: Colossal OSCAR 1 [sv; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2022-49","title":"Colossal OSCAR 1 [sv; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sv</code> Title: Colossal OSCAR 1 [sv; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2023-14","title":"Colossal OSCAR 1 [sv; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sv</code> Title: Colossal OSCAR 1 [sv; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sv/#colossal-oscar-1-sv-2023-23","title":"Colossal OSCAR 1 [sv; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sv</code> Title: Colossal OSCAR 1 [sv; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 6 B"},{"location":"datasets/language_sv/#eurlexresources-sv","title":"EurlexResources [sv]","text":"Dataset ID: <code>eurlex_sv</code> Title: EurlexResources [sv] Description: A Corpus Covering the Largest EURLEX Resources. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/eurlex_resources] License: Creative Commons Attribution 4.0 International licence (commercial use: True, sharealike: False) Tokens: 5 B"},{"location":"datasets/language_sv/#legalmc4-sv","title":"LegalMC4 [sv]","text":"Dataset ID: <code>legal_mc4_sv</code> Title: LegalMC4 [sv] Description: MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/joelito/legal-mc4] License: AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset. (commercial use: True, sharealike: None) Tokens: 328 M"},{"location":"datasets/language_sv/#the-swedish-culturomics-gigaword-corpus","title":"The Swedish Culturomics Gigaword Corpus","text":"Dataset ID: <code>sv_gigaword</code> Title: The Swedish Culturomics Gigaword Corpus Description: One billion Swedish words from 1950 and onwards. 
Code to extract data from the corpus, as well as usage instructions, can be downloaded from https://svn.spraakdata.gu.se/sb-arkiv/tools/gigaword/ Availibility: <code>direct_download</code> Homepage: [https://spraakbanken.gu.se/en/resources/gigaword] License: BY-SA 4.0 (commercial use: True, sharealike: False) Tokens: 1 B"},{"location":"datasets/language_sv/#wikibooks-sv","title":"Wikibooks [sv]","text":"Dataset ID: <code>wikibooks_sv</code> Title: Wikibooks [sv] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 3 M"},{"location":"datasets/language_sv/#wikinews-sv","title":"Wikinews [sv]","text":"Dataset ID: <code>wikinews_sv</code> Title: Wikinews [sv] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 540 k"},{"location":"datasets/language_sv/#wikipedia-sv","title":"Wikipedia [sv]","text":"Dataset ID: <code>wiki_sv</code> Title: Wikipedia [sv] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 130 M"},{"location":"datasets/language_sv/#wikiquote-sv","title":"Wikiquote [sv]","text":"Dataset ID: <code>wikiquote_sv</code> Title: Wikiquote [sv] Description: The free quote compendium that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 381 k"},{"location":"datasets/language_sv/#wikisource-sv","title":"Wikisource [sv]","text":"Dataset ID: <code>wikisource_sv</code> Title: Wikisource [sv] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 9 M"},{"location":"datasets/language_sv/#wikivoyage-sv","title":"Wikivoyage [sv]","text":"Dataset ID: <code>wikivoyage_sv</code> Title: Wikivoyage [sv] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: 1 M <p>This page is automatically generated.</p>"},{"location":"datasets/language_sw/","title":"Swahili Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Swahili language.</p>"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2015-14","title":"Colossal OSCAR 1 [sw; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_sw</code> Title: Colossal OSCAR 1 [sw; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2016-40","title":"Colossal OSCAR 1 [sw; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_sw</code> Title: Colossal OSCAR 1 [sw; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2017-43","title":"Colossal OSCAR 1 [sw; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_sw</code> Title: Colossal OSCAR 1 [sw; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2018-47","title":"Colossal OSCAR 1 [sw; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_sw</code> Title: Colossal OSCAR 1 [sw; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2019-22","title":"Colossal OSCAR 1 [sw; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_sw</code> Title: Colossal OSCAR 1 [sw; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2020-24","title":"Colossal OSCAR 1 [sw; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_sw</code> Title: Colossal OSCAR 1 [sw; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2020-45","title":"Colossal OSCAR 1 [sw; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_sw</code> Title: Colossal OSCAR 1 [sw; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2021-49","title":"Colossal OSCAR 1 [sw; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_sw</code> Title: Colossal OSCAR 1 [sw; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2022-27","title":"Colossal OSCAR 1 [sw; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_sw</code> Title: Colossal OSCAR 1 [sw; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2022-49","title":"Colossal OSCAR 1 [sw; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_sw</code> Title: Colossal OSCAR 1 [sw; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2023-14","title":"Colossal OSCAR 1 [sw; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_sw</code> Title: Colossal OSCAR 1 [sw; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#colossal-oscar-1-sw-2023-23","title":"Colossal OSCAR 1 [sw; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_sw</code> Title: Colossal OSCAR 1 [sw; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_sw/#wura-swahili","title":"WURA [Swahili]","text":"Dataset ID: <code>wura_sw</code> Title: WURA [Swahili] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ta/","title":"Tamil Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Tamil language.</p>"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2015-14","title":"Colossal OSCAR 1 [ta; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ta</code> Title: Colossal OSCAR 1 [ta; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2016-40","title":"Colossal OSCAR 1 [ta; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ta</code> Title: Colossal OSCAR 1 [ta; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2017-43","title":"Colossal OSCAR 1 [ta; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ta</code> Title: Colossal OSCAR 1 [ta; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2018-47","title":"Colossal OSCAR 1 [ta; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ta</code> Title: Colossal OSCAR 1 [ta; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2019-22","title":"Colossal OSCAR 1 [ta; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ta</code> Title: Colossal OSCAR 1 [ta; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2020-24","title":"Colossal OSCAR 1 [ta; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ta</code> Title: Colossal OSCAR 1 [ta; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2020-45","title":"Colossal OSCAR 1 [ta; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ta</code> Title: Colossal OSCAR 1 [ta; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2021-49","title":"Colossal OSCAR 1 [ta; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ta</code> Title: Colossal OSCAR 1 [ta; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2022-27","title":"Colossal OSCAR 1 [ta; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ta</code> Title: Colossal OSCAR 1 [ta; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2022-49","title":"Colossal OSCAR 1 [ta; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ta</code> Title: Colossal OSCAR 1 [ta; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2023-14","title":"Colossal OSCAR 1 [ta; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ta</code> Title: Colossal OSCAR 1 [ta; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ta/#colossal-oscar-1-ta-2023-23","title":"Colossal OSCAR 1 [ta; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ta</code> Title: Colossal OSCAR 1 [ta; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_te/","title":"Telugu Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Telugu language.</p>"},{"location":"datasets/language_te/#colossal-oscar-1-te-2015-14","title":"Colossal OSCAR 1 [te; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_te</code> Title: Colossal OSCAR 1 [te; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2016-40","title":"Colossal OSCAR 1 [te; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_te</code> Title: Colossal OSCAR 1 [te; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2017-43","title":"Colossal OSCAR 1 [te; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_te</code> Title: Colossal OSCAR 1 [te; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2018-47","title":"Colossal OSCAR 1 [te; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_te</code> Title: Colossal OSCAR 1 [te; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2019-22","title":"Colossal OSCAR 1 [te; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_te</code> Title: Colossal OSCAR 1 [te; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2020-24","title":"Colossal OSCAR 1 [te; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_te</code> Title: Colossal OSCAR 1 [te; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2020-45","title":"Colossal OSCAR 1 [te; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_te</code> Title: Colossal OSCAR 1 [te; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2021-49","title":"Colossal OSCAR 1 [te; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_te</code> Title: Colossal OSCAR 1 [te; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2022-27","title":"Colossal OSCAR 1 [te; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_te</code> Title: Colossal OSCAR 1 [te; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2022-49","title":"Colossal OSCAR 1 [te; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_te</code> Title: Colossal OSCAR 1 [te; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2023-14","title":"Colossal OSCAR 1 [te; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_te</code> Title: Colossal OSCAR 1 [te; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_te/#colossal-oscar-1-te-2023-23","title":"Colossal OSCAR 1 [te; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_te</code> Title: Colossal OSCAR 1 [te; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_tg/","title":"Tajik Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Tajik language.</p>"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2015-14","title":"Colossal OSCAR 1 [tg; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_tg</code> Title: Colossal OSCAR 1 [tg; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2016-40","title":"Colossal OSCAR 1 [tg; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_tg</code> Title: Colossal OSCAR 1 [tg; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2017-43","title":"Colossal OSCAR 1 [tg; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_tg</code> Title: Colossal OSCAR 1 [tg; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2018-47","title":"Colossal OSCAR 1 [tg; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_tg</code> Title: Colossal OSCAR 1 [tg; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2019-22","title":"Colossal OSCAR 1 [tg; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_tg</code> Title: Colossal OSCAR 1 [tg; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2020-24","title":"Colossal OSCAR 1 [tg; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_tg</code> Title: Colossal OSCAR 1 [tg; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2020-45","title":"Colossal OSCAR 1 [tg; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_tg</code> Title: Colossal OSCAR 1 [tg; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2021-49","title":"Colossal OSCAR 1 [tg; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_tg</code> Title: Colossal OSCAR 1 [tg; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2022-27","title":"Colossal OSCAR 1 [tg; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_tg</code> Title: Colossal OSCAR 1 [tg; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2022-49","title":"Colossal OSCAR 1 [tg; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_tg</code> Title: Colossal OSCAR 1 [tg; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2023-14","title":"Colossal OSCAR 1 [tg; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_tg</code> Title: Colossal OSCAR 1 [tg; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tg/#colossal-oscar-1-tg-2023-23","title":"Colossal OSCAR 1 [tg; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_tg</code> Title: Colossal OSCAR 1 [tg; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_th/","title":"Thai Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Thai language.</p>"},{"location":"datasets/language_th/#colossal-oscar-1-th-2015-14","title":"Colossal OSCAR 1 [th; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_th</code> Title: Colossal OSCAR 1 [th; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2016-40","title":"Colossal OSCAR 1 [th; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_th</code> Title: Colossal OSCAR 1 [th; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2017-43","title":"Colossal OSCAR 1 [th; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_th</code> Title: Colossal OSCAR 1 [th; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2018-47","title":"Colossal OSCAR 1 [th; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_th</code> Title: Colossal OSCAR 1 [th; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2019-22","title":"Colossal OSCAR 1 [th; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_th</code> Title: Colossal OSCAR 1 [th; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2020-24","title":"Colossal OSCAR 1 [th; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_th</code> Title: Colossal OSCAR 1 [th; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2020-45","title":"Colossal OSCAR 1 [th; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_th</code> Title: Colossal OSCAR 1 [th; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2021-49","title":"Colossal OSCAR 1 [th; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_th</code> Title: Colossal OSCAR 1 [th; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2022-27","title":"Colossal OSCAR 1 [th; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_th</code> Title: Colossal OSCAR 1 [th; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2022-49","title":"Colossal OSCAR 1 [th; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_th</code> Title: Colossal OSCAR 1 [th; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2023-14","title":"Colossal OSCAR 1 [th; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_th</code> Title: Colossal OSCAR 1 [th; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_th/#colossal-oscar-1-th-2023-23","title":"Colossal OSCAR 1 [th; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_th</code> Title: Colossal OSCAR 1 [th; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ti/","title":"Tigrinya Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Tigrinya language.</p>"},{"location":"datasets/language_ti/#wura-tigrinya","title":"WURA [Tigrinya]","text":"Dataset ID: <code>wura_ti</code> Title: WURA [Tigrinya] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_tk/","title":"Turkmen Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Turkmen language.</p>"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2015-14","title":"Colossal OSCAR 1 [tk; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_tk</code> Title: Colossal OSCAR 1 [tk; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2016-40","title":"Colossal OSCAR 1 [tk; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_tk</code> Title: Colossal OSCAR 1 [tk; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2017-43","title":"Colossal OSCAR 1 [tk; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_tk</code> Title: Colossal OSCAR 1 [tk; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2018-47","title":"Colossal OSCAR 1 [tk; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_tk</code> Title: Colossal OSCAR 1 [tk; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2019-22","title":"Colossal OSCAR 1 [tk; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_tk</code> Title: Colossal OSCAR 1 [tk; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2020-24","title":"Colossal OSCAR 1 [tk; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_tk</code> Title: Colossal OSCAR 1 [tk; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2020-45","title":"Colossal OSCAR 1 [tk; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_tk</code> Title: Colossal OSCAR 1 [tk; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2021-49","title":"Colossal OSCAR 1 [tk; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_tk</code> Title: Colossal OSCAR 1 [tk; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2022-27","title":"Colossal OSCAR 1 [tk; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_tk</code> Title: Colossal OSCAR 1 [tk; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2022-49","title":"Colossal OSCAR 1 [tk; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_tk</code> Title: Colossal OSCAR 1 [tk; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2023-14","title":"Colossal OSCAR 1 [tk; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_tk</code> Title: Colossal OSCAR 1 [tk; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tk/#colossal-oscar-1-tk-2023-23","title":"Colossal OSCAR 1 [tk; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_tk</code> Title: Colossal OSCAR 1 [tk; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_tl/","title":"Tagalog Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Tagalog language.</p>"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2015-14","title":"Colossal OSCAR 1 [tl; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_tl</code> Title: Colossal OSCAR 1 [tl; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2016-40","title":"Colossal OSCAR 1 [tl; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_tl</code> Title: Colossal OSCAR 1 [tl; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2017-43","title":"Colossal OSCAR 1 [tl; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_tl</code> Title: Colossal OSCAR 1 [tl; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2018-47","title":"Colossal OSCAR 1 [tl; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_tl</code> Title: Colossal OSCAR 1 [tl; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2019-22","title":"Colossal OSCAR 1 [tl; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_tl</code> Title: Colossal OSCAR 1 [tl; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2020-24","title":"Colossal OSCAR 1 [tl; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_tl</code> Title: Colossal OSCAR 1 [tl; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2020-45","title":"Colossal OSCAR 1 [tl; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_tl</code> Title: Colossal OSCAR 1 [tl; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2021-49","title":"Colossal OSCAR 1 [tl; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_tl</code> Title: Colossal OSCAR 1 [tl; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2022-27","title":"Colossal OSCAR 1 [tl; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_tl</code> Title: Colossal OSCAR 1 [tl; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2022-49","title":"Colossal OSCAR 1 [tl; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_tl</code> Title: Colossal OSCAR 1 [tl; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2023-14","title":"Colossal OSCAR 1 [tl; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_tl</code> Title: Colossal OSCAR 1 [tl; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tl/#colossal-oscar-1-tl-2023-23","title":"Colossal OSCAR 1 [tl; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_tl</code> Title: Colossal OSCAR 1 [tl; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_tr/","title":"Turkish Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Turkish language.</p>"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2015-14","title":"Colossal OSCAR 1 [tr; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_tr</code> Title: Colossal OSCAR 1 [tr; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2016-40","title":"Colossal OSCAR 1 [tr; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_tr</code> Title: Colossal OSCAR 1 [tr; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2017-43","title":"Colossal OSCAR 1 [tr; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_tr</code> Title: Colossal OSCAR 1 [tr; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2018-47","title":"Colossal OSCAR 1 [tr; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_tr</code> Title: Colossal OSCAR 1 [tr; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2019-22","title":"Colossal OSCAR 1 [tr; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_tr</code> Title: Colossal OSCAR 1 [tr; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2020-24","title":"Colossal OSCAR 1 [tr; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_tr</code> Title: Colossal OSCAR 1 [tr; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2020-45","title":"Colossal OSCAR 1 [tr; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_tr</code> Title: Colossal OSCAR 1 [tr; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2021-49","title":"Colossal OSCAR 1 [tr; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_tr</code> Title: Colossal OSCAR 1 [tr; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2022-27","title":"Colossal OSCAR 1 [tr; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_tr</code> Title: Colossal OSCAR 1 [tr; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2022-49","title":"Colossal OSCAR 1 [tr; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_tr</code> Title: Colossal OSCAR 1 [tr; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2023-14","title":"Colossal OSCAR 1 [tr; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_tr</code> Title: Colossal OSCAR 1 [tr; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tr/#colossal-oscar-1-tr-2023-23","title":"Colossal OSCAR 1 [tr; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_tr</code> Title: Colossal OSCAR 1 [tr; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_tt/","title":"Tatar Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Tatar language.</p>"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2015-14","title":"Colossal OSCAR 1 [tt; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_tt</code> Title: Colossal OSCAR 1 [tt; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2016-40","title":"Colossal OSCAR 1 [tt; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_tt</code> Title: Colossal OSCAR 1 [tt; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2017-43","title":"Colossal OSCAR 1 [tt; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_tt</code> Title: Colossal OSCAR 1 [tt; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2018-47","title":"Colossal OSCAR 1 [tt; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_tt</code> Title: Colossal OSCAR 1 [tt; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2019-22","title":"Colossal OSCAR 1 [tt; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_tt</code> Title: Colossal OSCAR 1 [tt; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2020-24","title":"Colossal OSCAR 1 [tt; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_tt</code> Title: Colossal OSCAR 1 [tt; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2020-45","title":"Colossal OSCAR 1 [tt; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_tt</code> Title: Colossal OSCAR 1 [tt; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2021-49","title":"Colossal OSCAR 1 [tt; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_tt</code> Title: Colossal OSCAR 1 [tt; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2022-27","title":"Colossal OSCAR 1 [tt; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_tt</code> Title: Colossal OSCAR 1 [tt; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2022-49","title":"Colossal OSCAR 1 [tt; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_tt</code> Title: Colossal OSCAR 1 [tt; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2023-14","title":"Colossal OSCAR 1 [tt; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_tt</code> Title: Colossal OSCAR 1 [tt; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_tt/#colossal-oscar-1-tt-2023-23","title":"Colossal OSCAR 1 [tt; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_tt</code> Title: Colossal OSCAR 1 [tt; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ug/","title":"Uighur Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Uighur language.</p>"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2015-14","title":"Colossal OSCAR 1 [ug; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ug</code> Title: Colossal OSCAR 1 [ug; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2016-40","title":"Colossal OSCAR 1 [ug; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ug</code> Title: Colossal OSCAR 1 [ug; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2017-43","title":"Colossal OSCAR 1 [ug; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ug</code> Title: Colossal OSCAR 1 [ug; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2018-47","title":"Colossal OSCAR 1 [ug; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ug</code> Title: Colossal OSCAR 1 [ug; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2019-22","title":"Colossal OSCAR 1 [ug; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ug</code> Title: Colossal OSCAR 1 [ug; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2020-24","title":"Colossal OSCAR 1 [ug; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ug</code> Title: Colossal OSCAR 1 [ug; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2020-45","title":"Colossal OSCAR 1 [ug; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ug</code> Title: Colossal OSCAR 1 [ug; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2021-49","title":"Colossal OSCAR 1 [ug; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ug</code> Title: Colossal OSCAR 1 [ug; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2022-27","title":"Colossal OSCAR 1 [ug; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ug</code> Title: Colossal OSCAR 1 [ug; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2022-49","title":"Colossal OSCAR 1 [ug; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ug</code> Title: Colossal OSCAR 1 [ug; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2023-14","title":"Colossal OSCAR 1 [ug; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ug</code> Title: Colossal OSCAR 1 [ug; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ug/#colossal-oscar-1-ug-2023-23","title":"Colossal OSCAR 1 [ug; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ug</code> Title: Colossal OSCAR 1 [ug; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_uk/","title":"Ukrainian Datasets","text":"<p>There are in total 20 datasets with 11 B tokens in Ukrainian language.</p>"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2015-14","title":"Colossal OSCAR 1 [uk; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_uk</code> Title: Colossal OSCAR 1 [uk; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2016-40","title":"Colossal OSCAR 1 [uk; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_uk</code> Title: Colossal OSCAR 1 [uk; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2017-43","title":"Colossal OSCAR 1 [uk; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_uk</code> Title: Colossal OSCAR 1 [uk; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2018-47","title":"Colossal OSCAR 1 [uk; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_uk</code> Title: Colossal OSCAR 1 [uk; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2019-22","title":"Colossal OSCAR 1 [uk; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_uk</code> Title: Colossal OSCAR 1 [uk; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2020-24","title":"Colossal OSCAR 1 [uk; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_uk</code> Title: Colossal OSCAR 1 [uk; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2020-45","title":"Colossal OSCAR 1 [uk; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_uk</code> Title: Colossal OSCAR 1 [uk; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2021-49","title":"Colossal OSCAR 1 [uk; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_uk</code> Title: Colossal OSCAR 1 [uk; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2022-27","title":"Colossal OSCAR 1 [uk; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_uk</code> Title: Colossal OSCAR 1 [uk; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2022-49","title":"Colossal OSCAR 1 [uk; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_uk</code> Title: Colossal OSCAR 1 [uk; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2023-14","title":"Colossal OSCAR 1 [uk; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_uk</code> Title: Colossal OSCAR 1 [uk; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#colossal-oscar-1-uk-2023-23","title":"Colossal OSCAR 1 [uk; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_uk</code> Title: Colossal OSCAR 1 [uk; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: 4 B"},{"location":"datasets/language_uk/#corpus-of-laws-and-legal-acts-of-ukraine","title":"Corpus of laws and legal acts of Ukraine","text":"Dataset ID: <code>uk_laws</code> Title: Corpus of laws and legal acts of Ukraine Description: A large (more than 9 Gb) corpus of laws and legal acts of Ukraine. Availibility: <code>direct_download</code> Homepage: [https://lang.org.ua/en/corpora/#anchor7] License: Unknown, likely public domain (commercial use: None, sharealike: None) Tokens: 579 M"},{"location":"datasets/language_uk/#macocu-web-corpus-ukrainian-10","title":"MaCoCu web corpus [Ukrainian 1.0]","text":"Dataset ID: <code>macocu_uk</code> Title: MaCoCu web corpus [Ukrainian 1.0] Description: MaCoCu focuses on collecting monolingual and parallel data from the Internet, specially for under-resourced languages and DSI-specific data. 
See https://macocu.eu/ Availibility: <code>direct_download</code> Homepage: [https://www.clarin.si/repository/xmlui/handle/11356/1838] License: CC0-No Rights Reserved (commercial use: True, sharealike: False) Tokens: 6 B"},{"location":"datasets/language_uk/#wikibooks-uk","title":"Wikibooks [uk]","text":"Dataset ID: <code>wikibooks_uk</code> Title: Wikibooks [uk] Description: The open-content textbooks collection that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikibooks.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#wikinews-uk","title":"Wikinews [uk]","text":"Dataset ID: <code>wikinews_uk</code> Title: Wikinews [uk] Description: News written by volunteers. Availibility: <code>direct_download</code> Homepage: [https://en.wikinews.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#wikipedia-uk","title":"Wikipedia [uk]","text":"Dataset ID: <code>wiki_uk</code> Title: Wikipedia [uk] Description: The free encyclopedia that anyone can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikipedia.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#wikiquote-uk","title":"Wikiquote [uk]","text":"Dataset ID: <code>wikiquote_uk</code> Title: Wikiquote [uk] Description: The free quote compendium that anyone can edit. 
Availibility: <code>direct_download</code> Homepage: [https://en.wikiquote.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#wikisource-uk","title":"Wikisource [uk]","text":"Dataset ID: <code>wikisource_uk</code> Title: Wikisource [uk] Description: The free library that anyone can improve. Availibility: <code>direct_download</code> Homepage: [https://en.wikisource.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uk/#wikivoyage-uk","title":"Wikivoyage [uk]","text":"Dataset ID: <code>wikivoyage_uk</code> Title: Wikivoyage [uk] Description: The free worldwide travel guide that you can edit. Availibility: <code>direct_download</code> Homepage: [https://en.wikivoyage.org/wiki/Main_Page] License: Creative Commons Attribution-ShareAlike 4.0 International License (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_ur/","title":"Urdu Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Urdu language.</p>"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2015-14","title":"Colossal OSCAR 1 [ur; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_ur</code> Title: Colossal OSCAR 1 [ur; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2016-40","title":"Colossal OSCAR 1 [ur; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_ur</code> Title: Colossal OSCAR 1 [ur; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2017-43","title":"Colossal OSCAR 1 [ur; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_ur</code> Title: Colossal OSCAR 1 [ur; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2018-47","title":"Colossal OSCAR 1 [ur; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_ur</code> Title: Colossal OSCAR 1 [ur; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2019-22","title":"Colossal OSCAR 1 [ur; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_ur</code> Title: Colossal OSCAR 1 [ur; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2020-24","title":"Colossal OSCAR 1 [ur; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_ur</code> Title: Colossal OSCAR 1 [ur; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2020-45","title":"Colossal OSCAR 1 [ur; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_ur</code> Title: Colossal OSCAR 1 [ur; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2021-49","title":"Colossal OSCAR 1 [ur; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_ur</code> Title: Colossal OSCAR 1 [ur; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2022-27","title":"Colossal OSCAR 1 [ur; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_ur</code> Title: Colossal OSCAR 1 [ur; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2022-49","title":"Colossal OSCAR 1 [ur; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_ur</code> Title: Colossal OSCAR 1 [ur; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2023-14","title":"Colossal OSCAR 1 [ur; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_ur</code> Title: Colossal OSCAR 1 [ur; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_ur/#colossal-oscar-1-ur-2023-23","title":"Colossal OSCAR 1 [ur; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_ur</code> Title: Colossal OSCAR 1 [ur; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_uz/","title":"Uzbek Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Uzbek language.</p>"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2015-14","title":"Colossal OSCAR 1 [uz; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_uz</code> Title: Colossal OSCAR 1 [uz; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2016-40","title":"Colossal OSCAR 1 [uz; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_uz</code> Title: Colossal OSCAR 1 [uz; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2017-43","title":"Colossal OSCAR 1 [uz; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_uz</code> Title: Colossal OSCAR 1 [uz; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2018-47","title":"Colossal OSCAR 1 [uz; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_uz</code> Title: Colossal OSCAR 1 [uz; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2019-22","title":"Colossal OSCAR 1 [uz; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_uz</code> Title: Colossal OSCAR 1 [uz; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2020-24","title":"Colossal OSCAR 1 [uz; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_uz</code> Title: Colossal OSCAR 1 [uz; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2020-45","title":"Colossal OSCAR 1 [uz; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_uz</code> Title: Colossal OSCAR 1 [uz; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2021-49","title":"Colossal OSCAR 1 [uz; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_uz</code> Title: Colossal OSCAR 1 [uz; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2022-27","title":"Colossal OSCAR 1 [uz; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_uz</code> Title: Colossal OSCAR 1 [uz; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2022-49","title":"Colossal OSCAR 1 [uz; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_uz</code> Title: Colossal OSCAR 1 [uz; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2023-14","title":"Colossal OSCAR 1 [uz; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_uz</code> Title: Colossal OSCAR 1 [uz; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_uz/#colossal-oscar-1-uz-2023-23","title":"Colossal OSCAR 1 [uz; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_uz</code> Title: Colossal OSCAR 1 [uz; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_vi/","title":"Vietnamese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Vietnamese language.</p>"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2015-14","title":"Colossal OSCAR 1 [vi; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_vi</code> Title: Colossal OSCAR 1 [vi; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2016-40","title":"Colossal OSCAR 1 [vi; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_vi</code> Title: Colossal OSCAR 1 [vi; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2017-43","title":"Colossal OSCAR 1 [vi; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_vi</code> Title: Colossal OSCAR 1 [vi; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2018-47","title":"Colossal OSCAR 1 [vi; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_vi</code> Title: Colossal OSCAR 1 [vi; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2019-22","title":"Colossal OSCAR 1 [vi; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_vi</code> Title: Colossal OSCAR 1 [vi; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2020-24","title":"Colossal OSCAR 1 [vi; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_vi</code> Title: Colossal OSCAR 1 [vi; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2020-45","title":"Colossal OSCAR 1 [vi; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_vi</code> Title: Colossal OSCAR 1 [vi; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2021-49","title":"Colossal OSCAR 1 [vi; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_vi</code> Title: Colossal OSCAR 1 [vi; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2022-27","title":"Colossal OSCAR 1 [vi; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_vi</code> Title: Colossal OSCAR 1 [vi; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2022-49","title":"Colossal OSCAR 1 [vi; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_vi</code> Title: Colossal OSCAR 1 [vi; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2023-14","title":"Colossal OSCAR 1 [vi; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_vi</code> Title: Colossal OSCAR 1 [vi; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vi/#colossal-oscar-1-vi-2023-23","title":"Colossal OSCAR 1 [vi; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_vi</code> Title: Colossal OSCAR 1 [vi; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_vo/","title":"Volap\u00fck Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Volap\u00fck language.</p>"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2015-14","title":"Colossal OSCAR 1 [vo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_vo</code> Title: Colossal OSCAR 1 [vo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2016-40","title":"Colossal OSCAR 1 [vo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_vo</code> Title: Colossal OSCAR 1 [vo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2017-43","title":"Colossal OSCAR 1 [vo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_vo</code> Title: Colossal OSCAR 1 [vo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2018-47","title":"Colossal OSCAR 1 [vo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_vo</code> Title: Colossal OSCAR 1 [vo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2019-22","title":"Colossal OSCAR 1 [vo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_vo</code> Title: Colossal OSCAR 1 [vo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2020-24","title":"Colossal OSCAR 1 [vo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_vo</code> Title: Colossal OSCAR 1 [vo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2020-45","title":"Colossal OSCAR 1 [vo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_vo</code> Title: Colossal OSCAR 1 [vo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2021-49","title":"Colossal OSCAR 1 [vo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_vo</code> Title: Colossal OSCAR 1 [vo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2022-27","title":"Colossal OSCAR 1 [vo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_vo</code> Title: Colossal OSCAR 1 [vo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2022-49","title":"Colossal OSCAR 1 [vo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_vo</code> Title: Colossal OSCAR 1 [vo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2023-14","title":"Colossal OSCAR 1 [vo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_vo</code> Title: Colossal OSCAR 1 [vo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_vo/#colossal-oscar-1-vo-2023-23","title":"Colossal OSCAR 1 [vo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_vo</code> Title: Colossal OSCAR 1 [vo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_wa/","title":"Walloon Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Walloon language.</p>"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2015-14","title":"Colossal OSCAR 1 [wa; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_wa</code> Title: Colossal OSCAR 1 [wa; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2016-40","title":"Colossal OSCAR 1 [wa; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_wa</code> Title: Colossal OSCAR 1 [wa; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2017-43","title":"Colossal OSCAR 1 [wa; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_wa</code> Title: Colossal OSCAR 1 [wa; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2018-47","title":"Colossal OSCAR 1 [wa; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_wa</code> Title: Colossal OSCAR 1 [wa; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2019-22","title":"Colossal OSCAR 1 [wa; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_wa</code> Title: Colossal OSCAR 1 [wa; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2020-24","title":"Colossal OSCAR 1 [wa; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_wa</code> Title: Colossal OSCAR 1 [wa; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2020-45","title":"Colossal OSCAR 1 [wa; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_wa</code> Title: Colossal OSCAR 1 [wa; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2021-49","title":"Colossal OSCAR 1 [wa; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_wa</code> Title: Colossal OSCAR 1 [wa; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2022-27","title":"Colossal OSCAR 1 [wa; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_wa</code> Title: Colossal OSCAR 1 [wa; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2022-49","title":"Colossal OSCAR 1 [wa; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_wa</code> Title: Colossal OSCAR 1 [wa; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2023-14","title":"Colossal OSCAR 1 [wa; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_wa</code> Title: Colossal OSCAR 1 [wa; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wa/#colossal-oscar-1-wa-2023-23","title":"Colossal OSCAR 1 [wa; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_wa</code> Title: Colossal OSCAR 1 [wa; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_war/","title":"War Datasets","text":"<p>There are in total 12 datasets with N/A tokens in War language.</p>"},{"location":"datasets/language_war/#colossal-oscar-1-war-2015-14","title":"Colossal OSCAR 1 [war; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_war</code> Title: Colossal OSCAR 1 [war; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2016-40","title":"Colossal OSCAR 1 [war; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_war</code> Title: Colossal OSCAR 1 [war; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2017-43","title":"Colossal OSCAR 1 [war; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_war</code> Title: Colossal OSCAR 1 [war; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2018-47","title":"Colossal OSCAR 1 [war; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_war</code> Title: Colossal OSCAR 1 [war; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2019-22","title":"Colossal OSCAR 1 [war; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_war</code> Title: Colossal OSCAR 1 [war; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2020-24","title":"Colossal OSCAR 1 [war; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_war</code> Title: Colossal OSCAR 1 [war; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2020-45","title":"Colossal OSCAR 1 [war; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_war</code> Title: Colossal OSCAR 1 [war; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2021-49","title":"Colossal OSCAR 1 [war; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_war</code> Title: Colossal OSCAR 1 [war; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2022-27","title":"Colossal OSCAR 1 [war; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_war</code> Title: Colossal OSCAR 1 [war; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2022-49","title":"Colossal OSCAR 1 [war; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_war</code> Title: Colossal OSCAR 1 [war; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2023-14","title":"Colossal OSCAR 1 [war; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_war</code> Title: Colossal OSCAR 1 [war; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_war/#colossal-oscar-1-war-2023-23","title":"Colossal OSCAR 1 [war; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_war</code> Title: Colossal OSCAR 1 [war; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_wuu/","title":"Wuu Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Wuu language.</p>"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2015-14","title":"Colossal OSCAR 1 [wuu; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_wuu</code> Title: Colossal OSCAR 1 [wuu; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2016-40","title":"Colossal OSCAR 1 [wuu; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_wuu</code> Title: Colossal OSCAR 1 [wuu; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2017-43","title":"Colossal OSCAR 1 [wuu; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_wuu</code> Title: Colossal OSCAR 1 [wuu; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2018-47","title":"Colossal OSCAR 1 [wuu; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_wuu</code> Title: Colossal OSCAR 1 [wuu; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2019-22","title":"Colossal OSCAR 1 [wuu; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_wuu</code> Title: Colossal OSCAR 1 [wuu; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2020-24","title":"Colossal OSCAR 1 [wuu; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_wuu</code> Title: Colossal OSCAR 1 [wuu; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2020-45","title":"Colossal OSCAR 1 [wuu; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_wuu</code> Title: Colossal OSCAR 1 [wuu; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2021-49","title":"Colossal OSCAR 1 [wuu; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_wuu</code> Title: Colossal OSCAR 1 [wuu; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2022-27","title":"Colossal OSCAR 1 [wuu; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_wuu</code> Title: Colossal OSCAR 1 [wuu; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2022-49","title":"Colossal OSCAR 1 [wuu; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_wuu</code> Title: Colossal OSCAR 1 [wuu; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2023-14","title":"Colossal OSCAR 1 [wuu; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_wuu</code> Title: Colossal OSCAR 1 [wuu; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_wuu/#colossal-oscar-1-wuu-2023-23","title":"Colossal OSCAR 1 [wuu; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_wuu</code> Title: Colossal OSCAR 1 [wuu; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_x-eml/","title":"X-Eml Datasets","text":"<p>There are in total 12 datasets with N/A tokens in X-Eml language.</p>"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2015-14","title":"Colossal OSCAR 1 [x-eml; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2016-40","title":"Colossal OSCAR 1 [x-eml; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2017-43","title":"Colossal OSCAR 1 [x-eml; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2018-47","title":"Colossal OSCAR 1 [x-eml; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2019-22","title":"Colossal OSCAR 1 [x-eml; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2020-24","title":"Colossal OSCAR 1 [x-eml; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2020-45","title":"Colossal OSCAR 1 [x-eml; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2021-49","title":"Colossal OSCAR 1 [x-eml; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2022-27","title":"Colossal OSCAR 1 [x-eml; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2022-49","title":"Colossal OSCAR 1 [x-eml; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2023-14","title":"Colossal OSCAR 1 [x-eml; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_x-eml/#colossal-oscar-1-x-eml-2023-23","title":"Colossal OSCAR 1 [x-eml; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_x-eml</code> Title: Colossal OSCAR 1 [x-eml; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_xal/","title":"Xal Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Xal language.</p>"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2015-14","title":"Colossal OSCAR 1 [xal; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_xal</code> Title: Colossal OSCAR 1 [xal; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2016-40","title":"Colossal OSCAR 1 [xal; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_xal</code> Title: Colossal OSCAR 1 [xal; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2017-43","title":"Colossal OSCAR 1 [xal; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_xal</code> Title: Colossal OSCAR 1 [xal; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2018-47","title":"Colossal OSCAR 1 [xal; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_xal</code> Title: Colossal OSCAR 1 [xal; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2019-22","title":"Colossal OSCAR 1 [xal; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_xal</code> Title: Colossal OSCAR 1 [xal; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2020-24","title":"Colossal OSCAR 1 [xal; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_xal</code> Title: Colossal OSCAR 1 [xal; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2020-45","title":"Colossal OSCAR 1 [xal; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_xal</code> Title: Colossal OSCAR 1 [xal; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2021-49","title":"Colossal OSCAR 1 [xal; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_xal</code> Title: Colossal OSCAR 1 [xal; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2022-27","title":"Colossal OSCAR 1 [xal; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_xal</code> Title: Colossal OSCAR 1 [xal; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2022-49","title":"Colossal OSCAR 1 [xal; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_xal</code> Title: Colossal OSCAR 1 [xal; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2023-14","title":"Colossal OSCAR 1 [xal; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_xal</code> Title: Colossal OSCAR 1 [xal; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xal/#colossal-oscar-1-xal-2023-23","title":"Colossal OSCAR 1 [xal; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_xal</code> Title: Colossal OSCAR 1 [xal; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_xh/","title":"Xhosa Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Xhosa language.</p>"},{"location":"datasets/language_xh/#wura-xhosa","title":"WURA [Xhosa]","text":"Dataset ID: <code>wura_xh</code> Title: WURA [Xhosa] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. 
Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_xmf/","title":"Xmf Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Xmf language.</p>"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2015-14","title":"Colossal OSCAR 1 [xmf; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_xmf</code> Title: Colossal OSCAR 1 [xmf; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2016-40","title":"Colossal OSCAR 1 [xmf; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_xmf</code> Title: Colossal OSCAR 1 [xmf; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2017-43","title":"Colossal OSCAR 1 [xmf; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_xmf</code> Title: Colossal OSCAR 1 [xmf; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2018-47","title":"Colossal OSCAR 1 [xmf; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_xmf</code> Title: Colossal OSCAR 1 [xmf; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2019-22","title":"Colossal OSCAR 1 [xmf; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_xmf</code> Title: Colossal OSCAR 1 [xmf; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2020-24","title":"Colossal OSCAR 1 [xmf; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_xmf</code> Title: Colossal OSCAR 1 [xmf; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2020-45","title":"Colossal OSCAR 1 [xmf; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_xmf</code> Title: Colossal OSCAR 1 [xmf; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2021-49","title":"Colossal OSCAR 1 [xmf; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_xmf</code> Title: Colossal OSCAR 1 [xmf; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2022-27","title":"Colossal OSCAR 1 [xmf; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_xmf</code> Title: Colossal OSCAR 1 [xmf; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2022-49","title":"Colossal OSCAR 1 [xmf; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_xmf</code> Title: Colossal OSCAR 1 [xmf; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2023-14","title":"Colossal OSCAR 1 [xmf; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_xmf</code> Title: Colossal OSCAR 1 [xmf; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_xmf/#colossal-oscar-1-xmf-2023-23","title":"Colossal OSCAR 1 [xmf; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_xmf</code> Title: Colossal OSCAR 1 [xmf; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_yi/","title":"Yiddish Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Yiddish language.</p>"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2015-14","title":"Colossal OSCAR 1 [yi; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_yi</code> Title: Colossal OSCAR 1 [yi; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2016-40","title":"Colossal OSCAR 1 [yi; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_yi</code> Title: Colossal OSCAR 1 [yi; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2017-43","title":"Colossal OSCAR 1 [yi; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_yi</code> Title: Colossal OSCAR 1 [yi; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2018-47","title":"Colossal OSCAR 1 [yi; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_yi</code> Title: Colossal OSCAR 1 [yi; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2019-22","title":"Colossal OSCAR 1 [yi; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_yi</code> Title: Colossal OSCAR 1 [yi; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2020-24","title":"Colossal OSCAR 1 [yi; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_yi</code> Title: Colossal OSCAR 1 [yi; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2020-45","title":"Colossal OSCAR 1 [yi; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_yi</code> Title: Colossal OSCAR 1 [yi; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2021-49","title":"Colossal OSCAR 1 [yi; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_yi</code> Title: Colossal OSCAR 1 [yi; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2022-27","title":"Colossal OSCAR 1 [yi; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_yi</code> Title: Colossal OSCAR 1 [yi; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2022-49","title":"Colossal OSCAR 1 [yi; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_yi</code> Title: Colossal OSCAR 1 [yi; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2023-14","title":"Colossal OSCAR 1 [yi; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_yi</code> Title: Colossal OSCAR 1 [yi; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yi/#colossal-oscar-1-yi-2023-23","title":"Colossal OSCAR 1 [yi; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_yi</code> Title: Colossal OSCAR 1 [yi; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_yo/","title":"Yoruba Datasets","text":"<p>There are in total 13 datasets with N/A tokens in Yoruba language.</p>"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2015-14","title":"Colossal OSCAR 1 [yo; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_yo</code> Title: Colossal OSCAR 1 [yo; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2016-40","title":"Colossal OSCAR 1 [yo; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_yo</code> Title: Colossal OSCAR 1 [yo; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2017-43","title":"Colossal OSCAR 1 [yo; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_yo</code> Title: Colossal OSCAR 1 [yo; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2018-47","title":"Colossal OSCAR 1 [yo; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_yo</code> Title: Colossal OSCAR 1 [yo; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2019-22","title":"Colossal OSCAR 1 [yo; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_yo</code> Title: Colossal OSCAR 1 [yo; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2020-24","title":"Colossal OSCAR 1 [yo; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_yo</code> Title: Colossal OSCAR 1 [yo; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2020-45","title":"Colossal OSCAR 1 [yo; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_yo</code> Title: Colossal OSCAR 1 [yo; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2021-49","title":"Colossal OSCAR 1 [yo; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_yo</code> Title: Colossal OSCAR 1 [yo; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2022-27","title":"Colossal OSCAR 1 [yo; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_yo</code> Title: Colossal OSCAR 1 [yo; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2022-49","title":"Colossal OSCAR 1 [yo; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_yo</code> Title: Colossal OSCAR 1 [yo; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2023-14","title":"Colossal OSCAR 1 [yo; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_yo</code> Title: Colossal OSCAR 1 [yo; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#colossal-oscar-1-yo-2023-23","title":"Colossal OSCAR 1 [yo; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_yo</code> Title: Colossal OSCAR 1 [yo; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_yo/#wura-yoruba","title":"WURA [Yoruba]","text":"Dataset ID: <code>wura_yo</code> Title: WURA [Yoruba] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_zh/","title":"Chinese Datasets","text":"<p>There are in total 12 datasets with N/A tokens in Chinese language.</p>"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2015-14","title":"Colossal OSCAR 1 [zh; 2015-14]","text":"Dataset ID: <code>colossal_oscar_2015-14_zh</code> Title: Colossal OSCAR 1 [zh; 2015-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2016-40","title":"Colossal OSCAR 1 [zh; 2016-40]","text":"Dataset ID: <code>colossal_oscar_2016-40_zh</code> Title: Colossal OSCAR 1 [zh; 2016-40] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2017-43","title":"Colossal OSCAR 1 [zh; 2017-43]","text":"Dataset ID: <code>colossal_oscar_2017-43_zh</code> Title: Colossal OSCAR 1 [zh; 2017-43] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2018-47","title":"Colossal OSCAR 1 [zh; 2018-47]","text":"Dataset ID: <code>colossal_oscar_2018-47_zh</code> Title: Colossal OSCAR 1 [zh; 2018-47] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2019-22","title":"Colossal OSCAR 1 [zh; 2019-22]","text":"Dataset ID: <code>colossal_oscar_2019-22_zh</code> Title: Colossal OSCAR 1 [zh; 2019-22] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2020-24","title":"Colossal OSCAR 1 [zh; 2020-24]","text":"Dataset ID: <code>colossal_oscar_2020-24_zh</code> Title: Colossal OSCAR 1 [zh; 2020-24] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2020-45","title":"Colossal OSCAR 1 [zh; 2020-45]","text":"Dataset ID: <code>colossal_oscar_2020-45_zh</code> Title: Colossal OSCAR 1 [zh; 2020-45] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2021-49","title":"Colossal OSCAR 1 [zh; 2021-49]","text":"Dataset ID: <code>colossal_oscar_2021-49_zh</code> Title: Colossal OSCAR 1 [zh; 2021-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2022-27","title":"Colossal OSCAR 1 [zh; 2022-27]","text":"Dataset ID: <code>colossal_oscar_2022-27_zh</code> Title: Colossal OSCAR 1 [zh; 2022-27] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2022-49","title":"Colossal OSCAR 1 [zh; 2022-49]","text":"Dataset ID: <code>colossal_oscar_2022-49_zh</code> Title: Colossal OSCAR 1 [zh; 2022-49] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2023-14","title":"Colossal OSCAR 1 [zh; 2023-14]","text":"Dataset ID: <code>colossal_oscar_2023-14_zh</code> Title: Colossal OSCAR 1 [zh; 2023-14] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. 
Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A"},{"location":"datasets/language_zh/#colossal-oscar-1-zh-2023-23","title":"Colossal OSCAR 1 [zh; 2023-23]","text":"Dataset ID: <code>colossal_oscar_2023-23_zh</code> Title: Colossal OSCAR 1 [zh; 2023-23] Description: The OSCAR project (Open Super-large Crawled Aggregated coRpus) is an Open Source project aiming to provide web-based multilingual resources and datasets for Machine Learning (ML) and Artificial Intelligence (AI) applications. The project focuses specifically in providing large quantities of unannotated raw data that is commonly used in the pre-training of large deep learning models. Availibility: <code>signin_download</code> Homepage: [https://huggingface.co/datasets/oscar-corpus/colossal-oscar-1.0] License: CommonCrawl terms of use; Only the annotations are distributed under a cc0-1.0 license (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"},{"location":"datasets/language_zu/","title":"Zu Datasets","text":"<p>There are in total 1 datasets with N/A tokens in Zu language.</p>"},{"location":"datasets/language_zu/#wura-zulu","title":"WURA [Zulu]","text":"Dataset ID: <code>wura_zu</code> Title: WURA [Zulu] Description: Wura is large-scale pretraining data for 20 languages popularly spoken in Africa. Availibility: <code>direct_download</code> Homepage: [https://huggingface.co/datasets/castorini/wura] License: Apache License Version 2.0 (commercial use: True, sharealike: None) Tokens: N/A <p>This page is automatically generated.</p>"}]}
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
new file mode 100644
index 0000000..58d4efa
--- /dev/null
+++ b/sitemap.xml
@@ -0,0 +1,888 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/add-your-own-data/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/compose-train-validation-data/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/config-files/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/extract-text-data/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/getting-started/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/overview/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/related-work/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/api/base_dataset/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/api/config/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/api/hf_dataset/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/api/jsonl_dataset/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_af/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_am/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_an/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ar/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_arz/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_as/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ast/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_av/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_az/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_azb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ba/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_be/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bg/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bh/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bpy/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_br/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bs/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_bxr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ca/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ce/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ceb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ckb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_code/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_cs/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_cv/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_cy/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_da/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_de/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_dsb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_dv/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_el/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_en/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_eo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_es/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_et/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_eu/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_fa/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_fi/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_fr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_fy/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ga/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_gd/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_gl/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_gn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_gom/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_gsw/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_gu/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ha/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_he/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_hi/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_hr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_hsb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ht/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_hu/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_hy/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ia/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_id/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ie/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ig/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ilo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_io/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_is/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_it/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ja/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_jbo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_jv/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ka/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_kk/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_km/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_kn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ko/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_krc/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ku/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_kv/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_kw/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ky/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_la/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_lb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_lez/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_li/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_lmo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_lo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_lt/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_lv/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mai/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mg/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mhr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_min/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mk/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ml/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mrj/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ms/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mt/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_multi/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mwl/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_my/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_mzn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_nah/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_nds/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ne/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_new/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_nl/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_nn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_no/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ny/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_oc/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_om/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_or/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_os/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_pa/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_pl/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_pms/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_pnb/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ps/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_pt/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_qu/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ro/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ru/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_rw/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sa/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sah/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sd/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sh/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_si/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sk/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sl/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sn/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_so/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sq/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_st/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_su/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sv/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_sw/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ta/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_te/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_tg/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_th/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ti/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_tk/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_tl/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_tr/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_tt/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ug/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_uk/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_ur/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_uz/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_vi/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_vo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_wa/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_war/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_wuu/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_x-eml/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_xal/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_xh/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_xmf/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_yi/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_yo/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_zh/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://github.com/malteos/llm-datasets/datasets/language_zu/</loc>
+         <lastmod>2024-07-18</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+</urlset>
\ No newline at end of file
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
new file mode 100644
index 0000000000000000000000000000000000000000..77a063e198579d82af679adf12f6b39a2a0fd720
GIT binary patch
literal 894
zcmV-^1A+V>iwFn+TbO17|8r?{Wo=<_E_iKh0M(t#Zrd;rMfdp%!+TjyP@rkyIJ<s9
zyACLkl4$Xva45_6*OzvJAo~JEjyF+cMLr<NJ9ma7XZL*aZgn(CYz$B9`^|Q}GNFyy
zhW=^&=l8Gbas9l%+s(;|$*Xd4kf-&K^YpW>mu1<A&Bm)qHZk?Jv_ttt>i3iNcJpJs
zzgv~p^#*T8x7W_ac7MH#l-u++4|B6=qp!Vk*+i+G^OaUvmCuxVS6{FG{KrbNk9z;G
zefU;wzgPE<<&yVvpDt?$6?)Sp^Sal{y7R97bmon!{=oVdK&5qcj&rKw5^iU|K5}_u
ziejpqlnvEUIa}T;Qvib>I@?#Bbp|j$%&81>o2y(7AOqZ+oQnihl2yt^e|_LsH0)^2
z^7ToRajFbywZt?6v~99=Qx0F=&oGGJ^nt-TbgZ#GNDPjH%&`$42d><HR=s&q9g^B3
zsRNQaF{v*|Y^I1vMq(e4)Xa%}X^7a3LQ)$fb&sSTkkm*k(nKWnbVgz$eP4q#P|XQB
z*ex<TwuVWKG*Iox#EzOdOOW!{A``I|8P+vYvze<=3sU?vayc4@q(&xU2APN%WN<Xd
z+Nzm})EzQ7b{0vEOnEz`>Fbg6y4Of*qz~zlKBSMxMu1)xBsMap4@m7BkO}XAbRq-N
zi44r*!hlR?2B!2O&8kHTwnds%iwrXs*(qWjvTA0T)W}+&MFz(MQm_xqZlVKnb7n+N
zk#R&)BWoffvH&$AHDpXJ5*wL>jZDEtRwzeg!t0R1(ILxE&LAJ3amd1|^T_bX%+nn*
zcXY_;=oNB3dSu<qGxuk_MP@V}ne%#N(}icw5ax1B2P8JKj_r|N#Lq4><NCssKV-J<
zFG$r63i+UV&`1FchDnXA`URx=1*VK3Ya;;}X(Do$hS`!Gkx5NN2FD4x-!UPh%*0F}
zr-0mDnwW!)%+_C#-2e&cpAvJdk=cwWCN;9WAjqzLVU9JjwkpW|FhQDCL6$@W8Peqy
zNsUa?nO~1BNYf|Ch9c${0%W9B&B(o(jO^XYNT<r|0?0_GnvqU5BYj9l`j8o^d^2)2
z&d7N^BcJIwBBRU^ne8r$NsU~N%Z$t%PnBV|k)IS38<`uNJn}y#o{_=kj0`px<ix)G
U2hqO$C?C833ytMr8x?*40PhL5IsgCw

literal 0
HcmV?d00001